├── LICENSE
├── README.md
├── audio_processing.py
├── filelists
│   ├── ljs_audio_text_test_filelist.txt
│   ├── ljs_audio_text_train_filelist.txt
│   └── ljs_audio_text_val_filelist.txt
├── hparams.py
├── index.html
├── inference.ipynb
├── layers.py
├── modules
│   ├── conv.py
│   ├── model.py
│   └── module.py
├── prepare_data.ipynb
├── requirements.txt
├── stft.py
├── text
│   ├── LICENSE
│   ├── __init__.py
│   ├── cleaners.py
│   ├── cmudict.py
│   ├── numbers.py
│   └── symbols.py
├── train.py
├── utils
│   ├── data_utils.py
│   ├── plot_image.py
│   ├── text2seq.py
│   └── utils.py
├── wav_samples
│   ├── Ablation study_hierarchy
│   │   ├── HelloMyFriends5.0_10.wav
│   │   ├── HelloMyFriends5.0_11.wav
│   │   ├── HelloMyFriends5.0_12.wav
│   │   ├── HelloMyFriends5.0_20.wav
│   │   ├── HelloMyFriends5.0_21.wav
│   │   ├── HelloMyFriends5.0_22.wav
│   │   ├── HelloMyFriends5.0_30.wav
│   │   ├── HelloMyFriends5.0_31.wav
│   │   ├── HelloMyFriends5.0_32.wav
│   │   ├── HelloMyFriends5.0_40.wav
│   │   ├── HelloMyFriends5.0_41.wav
│   │   ├── HelloMyFriends5.0_42.wav
│   │   ├── OneTwoThree5.0_10.wav
│   │   ├── OneTwoThree5.0_11.wav
│   │   ├── OneTwoThree5.0_12.wav
│   │   ├── OneTwoThree5.0_20.wav
│   │   ├── OneTwoThree5.0_21.wav
│   │   ├── OneTwoThree5.0_22.wav
│   │   ├── OneTwoThree5.0_30.wav
│   │   ├── OneTwoThree5.0_31.wav
│   │   ├── OneTwoThree5.0_32.wav
│   │   ├── OneTwoThree5.0_40.wav
│   │   ├── OneTwoThree5.0_41.wav
│   │   ├── OneTwoThree5.0_42.wav
│   │   ├── TrickOrTreat5.0_10.wav
│   │   ├── TrickOrTreat5.0_11.wav
│   │   ├── TrickOrTreat5.0_12.wav
│   │   ├── TrickOrTreat5.0_20.wav
│   │   ├── TrickOrTreat5.0_21.wav
│   │   ├── TrickOrTreat5.0_22.wav
│   │   ├── TrickOrTreat5.0_30.wav
│   │   ├── TrickOrTreat5.0_31.wav
│   │   ├── TrickOrTreat5.0_32.wav
│   │   ├── TrickOrTreat5.0_40.wav
│   │   ├── TrickOrTreat5.0_41.wav
│   │   └── TrickOrTreat5.0_42.wav
│   ├── Ablation study_speed control
│   │   ├── Concatenated.wav
│   │   ├── Emphasis on one.wav
│   │   ├── Emphasis on three.wav
│   │   └── Emphasis on two.wav
│   └── MOS-ID
│       ├── 0.GT_wav
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       ├── 1.GT_mel
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       ├── 2.Tacotron 2
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       ├── 3.Glow-TTS
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       ├── 4.BVAE-TTS
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       ├── 5.BVAE-TTS_nojitter
│       │   ├── LJ002-0253.wav
│       │   ├── LJ002-0260.wav
│       │   ├── LJ008-0121.wav
│       │   ├── LJ011-0141.wav
│       │   ├── LJ015-0194.wav
│       │   ├── LJ023-0016.wav
│       │   ├── LJ028-0145.wav
│       │   ├── LJ028-0349.wav
│       │   ├── LJ031-0014.wav
│       │   └── LJ046-0191.wav
│       └── samples.txt
└── waveglow
    ├── LICENSE
    ├── README.md
    ├── config.json
    ├── convert_model.py
    ├── denoiser.py
    ├── distributed.py
    ├── glow.py
    ├── glow_old.py
    ├── inference.py
    ├── mel2samp.py
    ├── requirements.txt
    ├── train.py
    └── waveglow_logo.png
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 LEE YOON HYUNG 4 | 5 |
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bidirectional Variational Inference for Non-Autoregressive Text-to-Speech (BVAE-TTS) 2 | ### Yoonhyung Lee, Joongbo Shin, Kyomin Jung 3 | **Abstract:** Although early text-to-speech (TTS) models such as Tacotron 2 have succeeded in generating human-like speech, their autoregressive architectures have several limitations: (1) they require considerable time to generate a mel-spectrogram, which consists of hundreds of steps; and (2) autoregressive speech generation lacks robustness, since errors propagate across decoding steps. In this paper, we propose a novel non-autoregressive TTS model called BVAE-TTS, which eliminates these architectural limitations and generates a mel-spectrogram in parallel. BVAE-TTS adopts a bidirectional-inference variational autoencoder (BVAE) that learns hierarchical latent representations using both bottom-up and top-down paths to increase its expressiveness. To apply BVAE to TTS, we design our model to utilize text information via an attention mechanism. Using the attention maps that BVAE-TTS generates, we train a duration predictor so that the model can use the predicted duration of each phoneme at inference time. In experiments conducted on the LJSpeech dataset, we show that our model generates a mel-spectrogram 27 times faster than Tacotron 2 with similar speech quality. Furthermore, BVAE-TTS outperforms Glow-TTS, one of the state-of-the-art non-autoregressive TTS models, in both speech quality and inference speed while having 58% fewer parameters. 4 | One-sentence Summary: This paper proposes BVAE-TTS, a novel non-autoregressive text-to-speech model based on a bidirectional-inference variational autoencoder. 5 | 6 | 7 | ## Training 8 | 1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/) 9 | 2. Create a `preprocessed` folder in the LJSpeech directory and preprocess the data using `prepare_data.ipynb` 10 | 3. Set `data_path` in `hparams.py` to the `preprocessed` folder 11 | 4. Train your own BVAE-TTS model 12 | ```bash 13 | python train.py --gpu=0 --logdir=baseline 14 | ``` 15 | 16 | 17 | ## Pre-trained models 18 | We provide a pre-trained BVAE-TTS model, which is the model you would obtain with the current settings (e.g. hyperparameters, dataset split). We also provide the pre-trained WaveGlow model that is used to generate the audio samples. After downloading the models, you can synthesize audio with `inference.ipynb`. 19 | - [BVAE-TTS](http://milabfile.snu.ac.kr:16000/bvae-tts/bvae_tts_300k.pt) 20 | - [WaveGlow](http://milabfile.snu.ac.kr:16000/bvae-tts/waveglow_256channels.pt) 21 | 22 | 23 | ## Audio Samples 24 | You can listen to the audio samples [here](https://leeyoonhyung.github.io/BVAE-TTS/). 25 | 26 | 27 | ## References 28 | 1. NVIDIA/tacotron2: https://github.com/NVIDIA/tacotron2 29 | 2. NVIDIA/waveglow: https://github.com/NVIDIA/waveglow 30 | 3. pclucas14/iaf-vae: https://github.com/pclucas14/iaf-vae 31 | -------------------------------------------------------------------------------- /audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, 8 | n_frames, 9 | hop_length=200, 10 | win_length=800, 11 | n_fft=800, 12 | dtype=np.float32, 13 | norm=None): 14 | """ 15 | # from librosa 0.6 16 | Compute the sum-square envelope of a window function at a given hop length. 17 | 18 | This is used to estimate modulation effects induced by windowing 19 | observations in short-time Fourier transforms. 20 | 21 | Parameters 22 | ---------- 23 | window : string, tuple, number, callable, or list-like 24 | Window specification, as in `get_window` 25 | 26 | n_frames : int > 0 27 | The number of analysis frames 28 | 29 | hop_length : int > 0 30 | The number of samples to advance between frames 31 | 32 | win_length : [optional] 33 | The length of the window function. By default, this matches `n_fft`. 34 | 35 | n_fft : int > 0 36 | The length of each analysis frame.
37 | 38 | dtype : np.dtype 39 | The data type of the output 40 | 41 | Returns 42 | ------- 43 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 44 | The sum-squared envelope of the window function 45 | """ 46 | if win_length is None: 47 | win_length = n_fft 48 | 49 | n = n_fft + hop_length * (n_frames - 1) 50 | x = np.zeros(n, dtype=dtype) 51 | 52 | # Compute the squared window at the desired length 53 | win_sq = get_window(window, win_length, fftbins=True) 54 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 55 | win_sq = librosa_util.pad_center(win_sq, n_fft) 56 | 57 | # Fill the envelope 58 | for i in range(n_frames): 59 | sample = i * hop_length 60 | x[sample:min(n, sample+n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 61 | return x 62 | 63 | 64 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 65 | """ 66 | PARAMS 67 | ------ 68 | magnitudes: spectrogram magnitudes 69 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 70 | """ 71 | 72 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 73 | angles = angles.astype(np.float32) 74 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 75 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 76 | 77 | for i in range(n_iters): 78 | _, angles = stft_fn.transform(signal) 79 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 80 | return signal 81 | 82 | 83 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 84 | """ 85 | PARAMS 86 | ------ 87 | C: compression factor 88 | """ 89 | return torch.log(torch.clamp(x, min=clip_val) * C) 90 | 91 | 92 | def dynamic_range_decompression(x, C=1): 93 | """ 94 | PARAMS 95 | ------ 96 | C: compression factor used to compress 97 | """ 98 | return torch.exp(x) / C -------------------------------------------------------------------------------- /filelists/ljs_audio_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | LJ022-0023|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | LJ043-0030|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | LJ005-0201|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | LJ001-0110|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | LJ003-0345|All the committee could do in this respect was to throw the responsibility on others. 6 | LJ007-0154|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | LJ018-0098|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | LJ047-0044|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | LJ031-0038|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | LJ048-0194|during the morning of November twenty-two prior to the motorcade. 11 | LJ049-0026|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | LJ004-0152|although at Mr. 
Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | LJ008-0278|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | LJ043-0002|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | LJ009-0114|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | LJ028-0506|A modern artist would have difficulty in doing such accurate work. 17 | LJ050-0168|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | LJ039-0223|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | LJ029-0032|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | LJ031-0070|Dr. Clark, who most closely observed the head wound, 21 | LJ034-0198|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window. 22 | LJ026-0068|Energy enters the plant, to a small extent, 23 | LJ039-0075|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | LJ004-0096|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | LJ005-0014|Speaking on a debate on prison matters, he declared that 26 | LJ012-0161|he was reported to have fallen away to a shadow. 27 | LJ018-0239|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | LJ019-0257|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | LJ028-0008|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | LJ024-0083|This plan of mine is no attack on the Court; 31 | LJ042-0129|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | LJ036-0103|The police asked him whether he could pick out his passenger from the lineup. 33 | LJ046-0058|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | LJ014-0076|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | LJ002-0043|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | LJ009-0076|We come to the sermon. 37 | LJ017-0131|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | LJ046-0184|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | LJ042-0096|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | LJ049-0050|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 
42 | LJ019-0186|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | LJ028-0307|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | LJ012-0235|While they were in a state of insensibility the murder was committed. 45 | LJ034-0053|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | LJ014-0030|These were damnatory facts which well supported the prosecution. 47 | LJ015-0203|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | LJ028-0093|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 49 | LJ002-0018|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | LJ028-0275|At last, in the twentieth month, 51 | LJ012-0042|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | LJ011-0096|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | LJ036-0077|Roger D. Craig, a deputy sheriff of Dallas County, 54 | LJ016-0318|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | LJ013-0164|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | LJ027-0141|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | LJ028-0335|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | LJ031-0202|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | LJ021-0145|From those willing to join in establishing this hoped-for period of peace, 60 | LJ016-0288|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | LJ028-0081|Years later, when the archaeologists could readily distinguish the false from the true, 62 | LJ018-0081|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | LJ021-0066|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | LJ009-0238|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | LJ005-0079|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | LJ035-0019|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | LJ036-0174|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | LJ046-0146|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | LJ017-0044|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | LJ017-0070|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 
71 | LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | LJ016-0020|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | LJ045-0230|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present, 74 | LJ035-0129|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | LJ008-0307|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | LJ008-0294|nearly indefinitely deferred. 77 | LJ047-0148|On October twenty-five, 78 | LJ008-0111|They entered a "stone cold room," and were presently joined by the prisoner. 79 | LJ034-0042|that he could only testify with certainty that the print was less than three days old. 80 | LJ037-0234|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | LJ040-0002|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | LJ045-0140|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | LJ012-0035|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | LJ012-0250|On the seventh July, eighteen thirty-seven, 85 | LJ016-0179|contracted with sheriffs and conveners to work by the job. 86 | LJ016-0138|at a distance from the prison. 87 | LJ027-0052|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | LJ031-0134|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | LJ019-0273|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | LJ014-0110|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | LJ034-0160|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | LJ038-0199|eleven. If I am alive and taken prisoner, 93 | LJ014-0010|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | LJ033-0047|I noticed when I went out that the light was on, end quote, 95 | LJ040-0027|He was never satisfied with anything. 96 | LJ048-0228|and others who were present say that no agent was inebriated or acted improperly. 97 | LJ003-0111|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | LJ008-0258|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | LJ029-0022|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | LJ004-0045|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
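Each filelist line above pairs an LJSpeech clip ID with its transcript, separated by `|`. Below is a minimal parsing sketch; the `phone_seq`/`melspectrogram` layout mirrors what `prepare_data.ipynb` writes, and the dataset path is the `hparams.py` default, so both are assumptions about the local setup rather than part of the filelist format itself.

```python
# Hedged sketch: resolving a filelist line (ID|transcript) to the .npy files
# produced by prepare_data.ipynb. Paths are illustrative defaults.
import os

def parse_filelist(path, data_dir='../Dataset/LJSpeech-1.1/preprocessed'):
    entries = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            file_id, text = line.rstrip('\n').split('|', 1)
            entries.append({
                'id': file_id,
                'text': text,
                # Layout written by prepare_data.ipynb:
                'seq': os.path.join(data_dir, 'phone_seq', f'{file_id}_sequence.npy'),
                'mel': os.path.join(data_dir, 'melspectrogram', f'{file_id}_melspectrogram.npy'),
            })
    return entries

val = parse_filelist('filelists/ljs_audio_text_val_filelist.txt')
print(val[0]['id'], '->', val[0]['text'][:40])
```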
-------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | from text import symbols 2 | 3 | 4 | ################################ 5 | # Experiment Parameters # 6 | ################################ 7 | seed=1234 8 | output_directory = 'training_log' 9 | iters_per_validation=1000 10 | iters_per_checkpoint=10000 11 | 12 | data_path = '../Dataset/LJSpeech-1.1/preprocessed' 13 | training_files='filelists/ljs_audio_text_train_filelist.txt' 14 | validation_files='filelists/ljs_audio_text_val_filelist.txt' 15 | test_files='filelists/ljs_audio_text_val_filelist.txt' 16 | text_cleaners=['english_cleaners'] 17 | 18 | 19 | ################################ 20 | # Audio Parameters # 21 | ################################ 22 | sampling_rate=22050 23 | filter_length=1024 24 | hop_length=256 25 | win_length=1024 26 | n_mel_channels=80 27 | mel_fmin=0 28 | mel_fmax=8000.0 29 | 30 | 31 | ################################ 32 | # Model Parameters # 33 | ################################ 34 | n_symbols=len(symbols) 35 | data_type='phone_seq' 36 | n_blocks=4 37 | n_layers=3 38 | kernel_size=5 39 | downsample_ratio=4 40 | symbols_embedding_dim=256 41 | hidden_dim=256 42 | max_db=2 43 | min_db=-12 44 | 45 | 46 | ################################ 47 | # Optimization Hyperparameters # 48 | ################################ 49 | lr=1e-3 50 | lr_warmup_steps=4000 51 | kl_warmup_steps=60000 52 | grad_clip_thresh=1.0 53 | batch_size=128 54 | train_steps = 300000 55 | 56 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | BVAE-TTS Audio Samples 7 | 8 | 9 | 10 | 11 |
[The page's HTML markup and audio-player tags did not survive extraction; the recoverable text content of index.html follows.]

Audio Samples of different TTS models (randomly picked)
Each sentence below was presented with one audio clip per system: Ground truth, WaveGlow, Tacotron2, Glow-TTS, BVAE-TTS, and BVAE-TTS w/o jittering.

LJ028-0349: who were each required to send so large a number to Babylon, that in all there were collected no fewer than fifty thousand.
LJ002-0260: Yet the public opinion of the whole body seems to have checked dissipation.
LJ031-0014: the Presidential limousine arrived at the emergency entrance of the Parkland Hospital at about twelve:thirty-five p.m.
LJ046-0191: it had established periodic regular review of the status of four hundred individuals;
LJ002-0253: were governed by rules which they themselves had framed, and under which subscriptions were levied
LJ008-0121: After the construction and action of the machine had been explained, the doctor asked the governor what kind of men he had commanded at Goree,
LJ028-0145: And here I may not omit to tell the use to which the mould dug out of the great moat was turned, nor the manner wherein the wall was wrought.
LJ015-0194: and behaved so as to justify a belief that he had been a jail-bird all his life.
LJ023-0016: In nineteen thirty-three you and I knew that we must never let our economic system get completely out of joint again
LJ011-0141: There were at the moment in Newgate six convicts sentenced to death for forging wills.

Audio Samples with higher temperature (T=5.0 for different blocks)
For each transcript, a table offered three samples (Sample1 to Sample3) per BVAE block (Block 1 to Block 4).

Transcript: Hello My Friends
Transcript: One Two Three
Transcript: Trick Or Treat

Speed Control
Transcript: One Two Three
Samples: Emphasis on One, Emphasis on Two, Emphasis on Three, Concatenated
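The samples above map onto the `temperature` and `alpha` arguments of `Model.inference` (see `modules/model.py` and `inference.ipynb`). The following is a hedged sketch of how such samples could be produced; `model` and `sequence` are assumed to be prepared exactly as in `inference.ipynb`, and since the repository does not show how the per-word emphasis clips were made, only the global `alpha` control is illustrated.

```python
# Hedged sketch: per-block temperature and global speed control with BVAE-TTS.
# Assumes `model` (BVAE-TTS checkpoint) and `sequence` (phoneme IDs on GPU)
# are set up as in inference.ipynb.
import torch

base_temperature = [0.333, 0.333, 0.333, 0.333]  # one entry per BVAE block

with torch.no_grad():
    # Hierarchy ablation: raise one entry to T=5.0 at a time. Model.inference
    # consumes this list while iterating the blocks top-down (in reversed
    # block order), so the index-to-block mapping follows that reversed order.
    for i in range(len(base_temperature)):
        temperature = list(base_temperature)
        temperature[i] = 5.0
        melspec, durations = model.inference(sequence, alpha=1.0,
                                             temperature=temperature)

    # Speed control: alpha rescales the predicted phoneme durations inside
    # LengthRegulator; alpha > 1.0 stretches them, i.e. slower speech.
    melspec_slow, _ = model.inference(sequence, alpha=1.5,
                                      temperature=base_temperature)
```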
483 | 484 | 485 | -------------------------------------------------------------------------------- /inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import libraries and setup matplotlib" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n", 18 | "\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")\n", 21 | "\n", 22 | "import sys\n", 23 | "sys.path.append('waveglow/')\n", 24 | "\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "%matplotlib inline\n", 27 | "\n", 28 | "import IPython.display as ipd\n", 29 | "from text import *\n", 30 | "import torch\n", 31 | "import hparams as hp\n", 32 | "from modules.model import Model\n", 33 | "from denoiser import Denoiser\n", 34 | "from utils.utils import *\n", 35 | "from utils.text2seq import text2seq\n", 36 | "\n", 37 | "waveglow_path = 'training_log/waveglow_256channels.pt'\n", 38 | "waveglow = torch.load(waveglow_path)['model']\n", 39 | "\n", 40 | "for m in waveglow.modules():\n", 41 | " if 'Conv' in str(type(m)):\n", 42 | " setattr(m, 'padding_mode', 'zeros')\n", 43 | "\n", 44 | "waveglow.cuda().eval()\n", 45 | "for k in waveglow.convinv:\n", 46 | " k.float()\n", 47 | "\n", 48 | "denoiser = Denoiser(waveglow)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Generate samples" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "scrolled": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "checkpoint_path = f\"training_log/baseline/bvae_tts_300k.pt\"\n", 67 | "model = Model(hp).cuda()\n", 68 | "model.load_state_dict(torch.load(checkpoint_path)['state_dict'])\n", 69 | "_ = model.cuda().eval()\n", 70 | "\n", 71 | "with open('filelists/ljs_audio_text_test_filelist.txt', 'r') as f:\n", 72 | " test_data = f.read().splitlines()\n", 73 | " \n", 74 | "for i, x in enumerate(test_data[:10]):\n", 75 | " file, text = x.split('|')\n", 76 | " print(f\"{file}: {text}\")\n", 77 | " phone_seq = text2seq(text)\n", 78 | " sequence = torch.autograd.Variable(torch.from_numpy(phone_seq)).cuda().long().unsqueeze(0)\n", 79 | "\n", 80 | " temperature=[0.333, 0.333, 0.333, 0.333]\n", 81 | " with torch.no_grad():\n", 82 | " melspec, durations = model.inference(sequence, alpha=1.0, temperature=temperature)\n", 83 | " melspec = melspec*(hp.max_db-hp.min_db)+hp.min_db\n", 84 | " audio = waveglow.infer(melspec, sigma=0.666)\n", 85 | " audio_denoised = denoiser(audio, strength=0.03)[:, 0]\n", 86 | " ipd.display(ipd.Audio(audio_denoised.cpu().numpy(), rate=22050))\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Environment (conda_pytorch_p36)", 93 | "language": "python", 94 | "name": "conda_pytorch_p36" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.6.10" 107 | }, 108 | "varInspector": { 109 | "cols": { 110 | "lenName": 16, 111 | "lenType": 16, 112 | "lenVar": 40 113 | }, 114 | "kernels_config": { 115 | "python": { 116 | "delete_cmd_postfix": "", 117 | "delete_cmd_prefix": "del ", 
118 | "library": "var_list.py", 119 | "varRefreshCmd": "print(var_dic_list())" 120 | }, 121 | "r": { 122 | "delete_cmd_postfix": ") ", 123 | "delete_cmd_prefix": "rm(", 124 | "library": "var_list.r", 125 | "varRefreshCmd": "cat(var_dic_list()) " 126 | } 127 | }, 128 | "types_to_exclude": [ 129 | "module", 130 | "function", 131 | "builtin_function_or_method", 132 | "instance", 133 | "_Feature" 134 | ], 135 | "window_display": false 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 2 140 | } 141 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from librosa.filters import mel as librosa_mel_fn 3 | from audio_processing import dynamic_range_compression 4 | from audio_processing import dynamic_range_decompression 5 | from stft import STFT 6 | 7 | 8 | class LinearNorm(torch.nn.Module): 9 | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): 10 | super(LinearNorm, self).__init__() 11 | self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) 12 | 13 | torch.nn.init.xavier_uniform_( 14 | self.linear_layer.weight, 15 | gain=torch.nn.init.calculate_gain(w_init_gain)) 16 | 17 | def forward(self, x): 18 | return self.linear_layer(x) 19 | 20 | 21 | class ConvNorm(torch.nn.Module): 22 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, 23 | padding=None, dilation=1, bias=True, w_init_gain='linear'): 24 | super(ConvNorm, self).__init__() 25 | if padding is None: 26 | assert(kernel_size % 2 == 1) 27 | padding = int(dilation * (kernel_size - 1) / 2) 28 | 29 | self.conv = torch.nn.Conv1d(in_channels, out_channels, 30 | kernel_size=kernel_size, stride=stride, 31 | padding=padding, dilation=dilation, 32 | bias=bias) 33 | 34 | torch.nn.init.xavier_uniform_( 35 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 36 | 37 | def forward(self, signal): 38 | conv_signal = self.conv(signal) 39 | return conv_signal 40 | 41 | 42 | class TacotronSTFT(torch.nn.Module): 43 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 44 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 45 | mel_fmax=8000.0): 46 | super(TacotronSTFT, self).__init__() 47 | self.n_mel_channels = n_mel_channels 48 | self.sampling_rate = sampling_rate 49 | self.stft_fn = STFT(filter_length, hop_length, win_length) 50 | mel_basis = librosa_mel_fn( 51 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 52 | mel_basis = torch.from_numpy(mel_basis).float() 53 | self.register_buffer('mel_basis', mel_basis) 54 | 55 | def spectral_normalize(self, magnitudes): 56 | output = dynamic_range_compression(magnitudes) 57 | return output 58 | 59 | def spectral_de_normalize(self, magnitudes): 60 | output = dynamic_range_decompression(magnitudes) 61 | return output 62 | 63 | def mel_spectrogram(self, y): 64 | """Computes mel-spectrograms from a batch of waves 65 | PARAMS 66 | ------ 67 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 68 | RETURNS 69 | ------- 70 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 71 | """ 72 | assert(torch.min(y.data) >= -1) 73 | assert(torch.max(y.data) <= 1) 74 | 75 | magnitudes, phases = self.stft_fn.transform(y) 76 | magnitudes = magnitudes.data 77 | mel_output = torch.matmul(self.mel_basis, magnitudes) 78 | mel_output = self.spectral_normalize(mel_output) 79 | return mel_output 
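Since `layers.py` ends here, a brief usage sketch may help: it mirrors how `prepare_data.ipynb` turns a waveform into the log-mel spectrogram the model trains on. The wav filename and the int16 full-scale constant are illustrative assumptions; the STFT parameters are the `hparams.py` values.

```python
# Hedged usage sketch for TacotronSTFT, mirroring prepare_data.ipynb.
import torch
import numpy as np
from scipy.io.wavfile import read
from layers import TacotronSTFT

stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                    n_mel_channels=80, sampling_rate=22050,
                    mel_fmin=0.0, mel_fmax=8000.0)

sr, data = read('LJ001-0001.wav')  # illustrative LJSpeech clip
# mel_spectrogram expects a (B, T) float tensor in [-1, 1]; 32768 is the
# assumed int16 full-scale value.
wav = torch.from_numpy(data.astype(np.float32)) / 32768.0
melspec = stft.mel_spectrogram(wav.unsqueeze(0))  # (1, 80, n_frames), log-compressed
print(melspec.shape)
```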
-------------------------------------------------------------------------------- /modules/conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Linear(nn.Linear): 6 | def __init__(self, 7 | in_dim, 8 | out_dim, 9 | bias=True, 10 | w_init_gain='linear'): 11 | super(Linear, self).__init__(in_dim, 12 | out_dim, 13 | bias) 14 | nn.init.xavier_uniform_(self.weight, 15 | gain=nn.init.calculate_gain(w_init_gain)) 16 | 17 | 18 | 19 | class Conv1d(nn.Conv1d): 20 | def __init__(self, *args, activation=None, **kwargs): 21 | super(Conv1d, self).__init__(*args, **kwargs) 22 | self.padding = (self.dilation[0]*(self.kernel_size[0]-1))//2 23 | self.act=None 24 | nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('linear')) 25 | 26 | if not activation is None: 27 | self.act = activation 28 | nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu')) 29 | 30 | def forward(self, inputs, mask=None): 31 | if self.act is None: 32 | outputs = super(Conv1d, self).forward(inputs) 33 | else: 34 | outputs = self.act(super(Conv1d, self).forward(inputs)) 35 | 36 | if mask is None: 37 | return outputs 38 | else: 39 | outputs = outputs.masked_fill(mask.unsqueeze(1), 0) 40 | return outputs 41 | -------------------------------------------------------------------------------- /modules/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .module import * 5 | from .conv import * 6 | from utils.utils import * 7 | 8 | 9 | class Model(nn.Module): 10 | def __init__(self, hp): 11 | super(Model, self).__init__() 12 | self.hp=hp 13 | self.ratio=hp.downsample_ratio 14 | self.text_mask=None 15 | self.mel_mask=None 16 | self.diag_mask = None 17 | 18 | # build network 19 | self.Prenet = Prenet(hp) 20 | self.TextEnc = TextEnc(hp) 21 | 22 | self.BVAE_blocks = nn.ModuleList() 23 | for i in range(hp.n_blocks): 24 | ForT= 'F' if i%2==0 else 'T' 25 | self.BVAE_blocks.append(BVAE_block(hp.hidden_dim//2**(i//2+1), 26 | hp.kernel_size, 27 | hp.n_layers, 28 | down_upsample=ForT)) 29 | 30 | self.Query = Conv1d(hp.hidden_dim//self.ratio, hp.hidden_dim, hp.kernel_size, bias=False) 31 | self.Compress = Linear(hp.hidden_dim, hp.hidden_dim//self.ratio, bias=False) 32 | self.Projection = Projection(hp.hidden_dim, hp.kernel_size, hp.n_mel_channels) 33 | 34 | # duration predictor 35 | self.Duration = DurationPredictor(hp) 36 | 37 | 38 | def forward(self, text, melspec, text_lengths, mel_lengths): 39 | ##### Prepare Mask##### 40 | self.text_mask, self.mel_mask, self.diag_mask = self.prepare_mask(text_lengths, mel_lengths) 41 | 42 | ##### Text ##### 43 | key, value = self.TextEnc(text, self.text_mask) 44 | 45 | ##### Bottom_Up ##### 46 | query=self.bottom_up(melspec, self.mel_mask) 47 | 48 | ##### Alignment ##### 49 | h, align = self.get_align(query, key, value, text_lengths, mel_lengths, self.text_mask, self.mel_mask) 50 | 51 | ##### Top_Down ##### 52 | mel_pred, kl_loss = self.top_down(h, self.mel_mask) 53 | 54 | ##### Compute Loss ##### 55 | duration_out = self.get_duration(value, self.text_mask) 56 | recon_loss, duration_loss, align_loss = self.compute_loss(mel_pred, 57 | melspec, 58 | duration_out, 59 | align, 60 | mel_lengths, 61 | self.text_mask, 62 | self.mel_mask, 63 | self.diag_mask) 64 | 65 | return recon_loss, kl_loss, duration_loss, align_loss 66 | 67 | 68 | def prepare_mask(self, text_lengths, 
mel_lengths): 69 | B, L, T = text_lengths.size(0), text_lengths.max().item(), mel_lengths.max().item() 70 | text_mask = get_mask_from_lengths(text_lengths) 71 | mel_mask = get_mask_from_lengths(mel_lengths) 72 | x = (torch.arange(L).float().unsqueeze(0).to(text_lengths.device)/text_lengths.unsqueeze(1)).unsqueeze(1)\ 73 | - (torch.arange(T//self.ratio).float().unsqueeze(0).to(text_lengths.device)/(mel_lengths//self.ratio).unsqueeze(1)).unsqueeze(2) 74 | diag_mask = (-12.5*torch.pow(x, 2)).exp() 75 | diag_mask = diag_mask.masked_fill(text_mask.unsqueeze(1), 0) 76 | diag_mask = diag_mask.masked_fill(mel_mask[:,::self.ratio].unsqueeze(-1), 0) 77 | 78 | return text_mask, mel_mask, diag_mask 79 | 80 | 81 | def bottom_up(self, melspec, mel_mask): 82 | x = self.Prenet(melspec, mel_mask) 83 | for i, block in enumerate(self.BVAE_blocks): 84 | x = block.up(x, mel_mask[:, ::2**((i+1)//2)]) 85 | 86 | query = self.Query(x, mel_mask[:,::self.ratio]).transpose(1,2) 87 | 88 | return query 89 | 90 | 91 | def top_down(self, h, mel_mask): 92 | kl = 0 93 | for i, block in enumerate(reversed(self.BVAE_blocks)): 94 | h, curr_kl = block.down(h, mel_mask[:, ::2**(len(self.BVAE_blocks)//2-(i+1)//2)]) 95 | kl += curr_kl 96 | 97 | mel_pred = self.Projection(h, mel_mask) 98 | 99 | return mel_pred, kl 100 | 101 | 102 | def get_align(self, q, k, v, text_lengths, mel_lengths, text_mask, mel_mask): 103 | q = q + PositionalEncoding(self.hp.hidden_dim, mel_lengths//self.ratio) 104 | k = k + PositionalEncoding(self.hp.hidden_dim, text_lengths, 1.0*mel_lengths/self.ratio/text_lengths) 105 | 106 | q = q * self.hp.hidden_dim ** -0.5 107 | scores = torch.bmm(q, k.transpose(1, 2)) 108 | scores = scores.masked_fill(text_mask.unsqueeze(1), -float('inf')) 109 | 110 | align = scores.softmax(-1) 111 | align = align.masked_fill(mel_mask[:,::self.ratio].unsqueeze(-1), 0) 112 | if self.training: 113 | align_oh = self.jitter(F.one_hot(align.max(-1)[1], align.size(-1)), mel_lengths) 114 | else: 115 | align_oh = F.one_hot(align.max(-1)[1], align.size(-1)) 116 | align_oh = align_oh.masked_fill(mel_mask[:,::self.ratio].unsqueeze(-1), 0) 117 | 118 | attn_output = torch.bmm(align + (align_oh-align).detach(), v) 119 | attn_output = self.Compress(attn_output).transpose(1,2) 120 | 121 | return attn_output, align 122 | 123 | 124 | def compute_loss(self, mel_pred, mel_target, duration_out, align, mel_lengths, text_mask, mel_mask, diag_mask): 125 | # Recon Loss 126 | recon_loss = nn.L1Loss()(mel_pred.masked_select(~mel_mask.unsqueeze(1)), 127 | mel_target.masked_select(~mel_mask.unsqueeze(1))) 128 | 129 | # Duration Loss 130 | duration_target = self.align2duration(align, mel_lengths) 131 | duration_target_flat = duration_target.masked_select(~text_mask) 132 | duration_target_flat[duration_target_flat<=0]=1 133 | duration_out_flat = duration_out.masked_select(~text_mask) 134 | duration_loss = nn.MSELoss()( torch.log(duration_out_flat+1e-5), torch.log(duration_target_flat+1e-5) ) 135 | 136 | # Guide Loss 137 | align_losses = align*(1-diag_mask) 138 | align_loss = torch.mean(align_losses.masked_select(diag_mask.bool())) 139 | 140 | return recon_loss, duration_loss, align_loss 141 | 142 | 143 | def inference(self, text, alpha=1.0, temperature=1.0): 144 | assert len(text)==1, 'You must encode only one sentence at once' 145 | text_lengths = torch.tensor([text.size(1)]).to(text.device) 146 | key, value = self.TextEnc(text) 147 | durations = self.get_duration(value) 148 | h, durations = self.LengthRegulator(value, durations, alpha) 149 | h = 
self.Compress(h).transpose(1,2) 150 | 151 | if isinstance(temperature, float): 152 | temperature=[temperature]*len(self.BVAE_blocks) 153 | 154 | for i, block in enumerate(reversed(self.BVAE_blocks)): 155 | h, _ = block.down(h, sample=True, temperature=temperature[i]) 156 | 157 | mel_out = self.Projection(h) 158 | 159 | return mel_out, durations 160 | 161 | 162 | def get_duration(self, value, mask=None): 163 | durations = self.Duration(value.transpose(1,2).detach(), mask) 164 | return durations 165 | 166 | 167 | def align2duration(self, alignments, mel_lengths): 168 | max_ids = torch.max(alignments, dim=2)[1] 169 | max_ids_oh = F.one_hot(max_ids, alignments.size(2)) 170 | mask = get_mask_from_lengths(mel_lengths//self.ratio).unsqueeze(-1) 171 | max_ids_oh.masked_fill_(mask, 0) 172 | durations = max_ids_oh.sum(dim=1).to(torch.float) 173 | return durations 174 | 175 | 176 | def LengthRegulator(self, hidden_states, durations, alpha=1.0): 177 | durations = torch.round(durations*alpha).to(torch.long) 178 | durations[durations<=0]=1 179 | return hidden_states.repeat_interleave(durations[0], dim=1), durations 180 | 181 | 182 | def jitter(self, alignments, mel_lengths): 183 | B, T, _ = alignments.size() 184 | batch_indices = torch.arange(B).unsqueeze(1).to(alignments.device) 185 | jitter_indices = torch.arange(T).unsqueeze(0).repeat(B,1).to(alignments.device) 186 | jitter_indices = torch.round(jitter_indices + (2*torch.rand(jitter_indices.size())-1).to(alignments.device)).to(torch.long) 187 | jitter_indices = torch.where(jitter_indices<(mel_lengths//self.ratio).unsqueeze(1), 188 | jitter_indices, 189 | ((mel_lengths//self.ratio)-1).unsqueeze(-1).repeat(1,T)) 190 | jitter_indices[jitter_indices<=0]=0 191 | alignments = alignments[batch_indices, jitter_indices] 192 | alignments.masked_fill_(self.mel_mask[:,::self.ratio].unsqueeze(-1), 0) 193 | return alignments 194 | 195 | 196 | -------------------------------------------------------------------------------- /modules/module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.utils.spectral_norm as sn 5 | import torch.distributions as D 6 | from .conv import * 7 | 8 | 9 | class Softplus(torch.autograd.Function): 10 | @staticmethod 11 | def forward(ctx, i): 12 | result=torch.log(1+torch.exp(i)) 13 | ctx.save_for_backward(i) 14 | return result 15 | 16 | @staticmethod 17 | def backward(ctx, grad_output): 18 | return grad_output*torch.sigmoid(ctx.saved_variables[0]) 19 | 20 | 21 | class CustomSoftplus(nn.Module): 22 | def forward(self, input_tensor): 23 | return Softplus.apply(input_tensor) 24 | 25 | 26 | 27 | class TextEnc(nn.Module): 28 | def __init__(self, hp): 29 | super(TextEnc, self).__init__() 30 | self.Embedding = nn.Embedding(hp.n_symbols, hp.hidden_dim) 31 | self.conv_layers = nn.ModuleList([Conv1d(hp.hidden_dim, 2*hp.hidden_dim, hp.kernel_size) for _ in range(7)]) 32 | 33 | def forward(self, text, mask=None): 34 | embedded = F.dropout(self.Embedding(text), 0.1, training=self.training) 35 | x = embedded.transpose(1,2) 36 | 37 | for conv in self.conv_layers: 38 | x1, x2 = torch.chunk( conv(x, mask), 2, dim=1) 39 | x = (x1 * torch.sigmoid(x2) + x) / 2**0.5 40 | x = F.dropout(x, 0.1, training=self.training) 41 | 42 | key = x.transpose(1, 2) 43 | value = (key+embedded)/2**0.5 44 | return key, value 45 | 46 | 47 | 48 | class BVAE_block(nn.Module): 49 | def __init__(self, hdim, kernel_size, n_layers, down_upsample): 50 | 
super(BVAE_block, self).__init__() 51 | self.down_upsample=down_upsample 52 | self.BVAE_layers = nn.ModuleList() 53 | for i in range(n_layers): 54 | self.BVAE_layers.append(BVAE_layer(hdim, 55 | kernel_size, 56 | dilation=2**i, 57 | adj_dim=( (down_upsample=='F') and (i==0) ))) 58 | 59 | def up(self, inputs, mask=None): 60 | if self.down_upsample=='T': 61 | inputs = self.blur_pool(inputs, mask) 62 | x = inputs 63 | for layer in self.BVAE_layers: 64 | x = layer.up(x, mask) 65 | 66 | return x 67 | 68 | 69 | def down(self, inputs, mask=None, sample=False, temperature=1.0): 70 | x = inputs 71 | kl=0 72 | for layer in reversed(self.BVAE_layers): 73 | x, curr_kl = layer.down(x, mask, sample, temperature) 74 | kl += curr_kl 75 | 76 | if self.down_upsample=='T': 77 | x = x.repeat_interleave(2,-1) 78 | 79 | return x, kl 80 | 81 | 82 | def blur_pool(self, x, mask): 83 | blur_kernel = (torch.tensor([[[0.25,0.5,0.25]]])).repeat(x.size(1),1,1).to(x.device) 84 | outputs = F.conv1d(x, blur_kernel, padding=1, stride=2, groups=x.size(1)) 85 | outputs = outputs.masked_fill(mask.unsqueeze(1), 0) 86 | return outputs 87 | 88 | 89 | 90 | class BVAE_layer(nn.Module): 91 | def __init__(self, hdim, kernel_size, dilation=1, adj_dim=False): 92 | super(BVAE_layer, self).__init__() 93 | self.softplus = CustomSoftplus() 94 | 95 | ####################### BOTTOM_UP ######################### 96 | if adj_dim==True: 97 | self.pre_conv = Conv1d(2*hdim, hdim, kernel_size, activation=F.elu, dilation=dilation) 98 | else: 99 | self.pre_conv = Conv1d(hdim, hdim, kernel_size, activation=F.elu, dilation=dilation) 100 | 101 | self.up_conv_a = nn.ModuleList([sn(Conv1d(hdim, hdim, kernel_size, activation=F.elu)), 102 | sn(Conv1d(hdim, 3*hdim, kernel_size, bias=False))]) 103 | self.up_conv_b = sn(Conv1d(hdim, hdim, kernel_size, activation=F.elu)) 104 | 105 | ######################## TOP_DOWN ########################## 106 | self.down_conv_a = nn.ModuleList([sn(Conv1d(hdim, hdim, kernel_size, activation=F.elu)), 107 | sn(Conv1d(hdim, 5*hdim, kernel_size, bias=False))]) 108 | self.down_conv_b = nn.ModuleList([sn(Conv1d(2*hdim, hdim, kernel_size, bias=False)), 109 | sn(Conv1d(hdim, hdim, kernel_size, activation=F.elu))]) 110 | 111 | if adj_dim==True: 112 | self.post_conv = Conv1d(hdim, 2*hdim, kernel_size, activation=F.elu, dilation=dilation) 113 | else: 114 | self.post_conv = Conv1d(hdim, hdim, kernel_size, activation=F.elu, dilation=dilation) 115 | 116 | 117 | def up(self, inputs, mask=None): 118 | inputs = self.pre_conv(inputs, mask) 119 | x = self.up_conv_a[0](inputs, mask) 120 | self.qz_mean, self.qz_std, h = self.up_conv_a[1](x, mask).chunk(3, 1) 121 | self.qz_std = self.softplus(self.qz_std) 122 | h = self.up_conv_b(h, mask) 123 | 124 | return (inputs+h)/2**0.5 125 | 126 | 127 | def down(self, inputs, mask=None, sample=False, temp=1): 128 | x = self.down_conv_a[0](inputs, mask) 129 | pz_mean, pz_std, rz_mean, rz_std, h = self.down_conv_a[1](x, mask).chunk(5, 1) 130 | pz_std, rz_std = self.softplus(pz_std), self.softplus(rz_std) 131 | 132 | if sample==True: 133 | prior = D.Normal(pz_mean, pz_std*temp) 134 | z = prior.rsample() 135 | kl = torch.zeros(inputs.size(0)).to(inputs.device).mean() 136 | 137 | else: 138 | prior = D.Normal(pz_mean, pz_std) 139 | posterior = D.Normal(pz_mean+self.qz_mean+rz_mean, pz_std*self.qz_std*rz_std) 140 | z = posterior.rsample().masked_fill(mask.unsqueeze(1), 0) 141 | kl = D.kl.kl_divergence(posterior, prior).mean() 142 | 143 | h = torch.cat((z, h), 1) 144 | h = self.down_conv_b[0](h, mask) 145 | h = 
self.down_conv_b[1](h, mask) 146 | outputs = self.post_conv((inputs+h)/2**0.5, mask) 147 | 148 | return outputs, kl 149 | 150 | 151 | class DurationPredictor(nn.Module): 152 | def __init__(self, hp): 153 | super(DurationPredictor, self).__init__() 154 | self.conv1 = Conv1d(hp.hidden_dim, hp.hidden_dim, 3, bias=False, activation=F.elu) 155 | self.conv2 = Conv1d(hp.hidden_dim, hp.hidden_dim, 3, bias=False, activation=F.elu) 156 | 157 | self.ln1 = nn.LayerNorm(hp.hidden_dim) 158 | self.ln2 = nn.LayerNorm(hp.hidden_dim) 159 | self.dropout = nn.Dropout(0.1) 160 | 161 | self.linear = Linear(hp.hidden_dim, 1) 162 | 163 | def forward(self, h, mask=None): 164 | x = self.conv1(h, mask) 165 | x = self.dropout(self.ln1(x.transpose(1,2))) 166 | x = self.conv2(x.transpose(1,2), mask) 167 | x = self.dropout(self.ln2(x.transpose(1,2))) 168 | out = self.linear(x).exp()+1 169 | 170 | return out.squeeze(-1) 171 | 172 | 173 | 174 | class Prenet(nn.Module): 175 | def __init__(self, hp): 176 | super(Prenet, self).__init__() 177 | self.layers = nn.ModuleList([Conv1d(hp.n_mel_channels, hp.hidden_dim, 1, bias=True, activation=F.elu), 178 | Conv1d(hp.hidden_dim, hp.hidden_dim, 1, bias=True, activation=F.elu)]) 179 | 180 | def forward(self, x, mask=None): 181 | for i, layer in enumerate(self.layers): 182 | x = F.dropout(layer(x, mask), 0.5, training=True) 183 | return x 184 | 185 | 186 | 187 | class Projection(nn.Module): 188 | def __init__(self, hdim, kernel_size, outdim): 189 | super(Projection, self).__init__() 190 | self.layers=nn.ModuleList([Conv1d(hdim, hdim, kernel_size, activation=F.elu), 191 | Conv1d(hdim, hdim, kernel_size, activation=F.elu), 192 | Conv1d(hdim, outdim, kernel_size)]) 193 | 194 | def forward(self, x, mask=None): 195 | for i, layer in enumerate(self.layers): 196 | if i(torch.abs(wav).max()*0.05))[0][0]\n", 35 | " end = torch.where(torch.abs(wav)>(torch.abs(wav).max()*0.05))[0][-1]\n", 36 | " \n", 37 | " ### 50ms silence padding ###\n", 38 | " wav = torch.nn.functional.pad(wav[start:end], (0, hp.sampling_rate//20))\n", 39 | " melspec = stft.mel_spectrogram(wav.unsqueeze(0))\n", 40 | " \n", 41 | " return melspec.squeeze(0), wav\n", 42 | "\n", 43 | "\n", 44 | "if not os.path.exists(f'{data_dir}'):\n", 45 | " os.mkdir(f'{data_dir}')\n", 46 | "if not os.path.exists(f'{data_dir}/phone_seq'):\n", 47 | " os.mkdir(f'{data_dir}/phone_seq')\n", 48 | "if not os.path.exists(f'{data_dir}/melspectrogram'):\n", 49 | " os.mkdir(f'{data_dir}/melspectrogram')\n", 50 | "\n", 51 | "\n", 52 | "with codecs.open(csv_file, 'r', 'utf-8') as f:\n", 53 | " for line in tqdm(f.readlines()):\n", 54 | " fname, _, text = line.split(\"|\")\n", 55 | " wav_name = os.path.join(root_dir, fname) + '.wav'\n", 56 | " phone_seq = text2seq(text)\n", 57 | " melspec, wav = get_mel(wav_name)\n", 58 | " np.save(f'{data_dir}/phone_seq/{fname}_sequence.npy', phone_seq)\n", 59 | " np.save(f'{data_dir}/melspectrogram/{fname}_melspectrogram.npy', melspec.numpy())\n", 60 | " \n", 61 | "print(\"FINISH DATA PREPROCESSING!!!\")" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Environment (conda_pytorch_p36)", 68 | "language": "python", 69 | "name": "conda_pytorch_p36" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.6.10" 82 | }, 83 | "varInspector": { 84 | "cols": { 85 | 
"lenName": 16, 86 | "lenType": 16, 87 | "lenVar": 40 88 | }, 89 | "kernels_config": { 90 | "python": { 91 | "delete_cmd_postfix": "", 92 | "delete_cmd_prefix": "del ", 93 | "library": "var_list.py", 94 | "varRefreshCmd": "print(var_dic_list())" 95 | }, 96 | "r": { 97 | "delete_cmd_postfix": ") ", 98 | "delete_cmd_prefix": "rm(", 99 | "library": "var_list.r", 100 | "varRefreshCmd": "cat(var_dic_list()) " 101 | } 102 | }, 103 | "types_to_exclude": [ 104 | "module", 105 | "function", 106 | "builtin_function_or_method", 107 | "instance", 108 | "_Feature" 109 | ], 110 | "window_display": false 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.1.3 2 | tensorboard 3 | numpy==1.22.0 4 | inflect==4.0.0 5 | librosa==0.7.1 6 | scipy==1.4.1 7 | Unidecode==1.1.1 8 | pillow==9.0.1 9 | g2p_en==2.0.0 -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 45 | window='hann'): 46 | super(STFT, self).__init__() 47 | self.filter_length = filter_length 48 | self.hop_length = hop_length 49 | self.win_length = win_length 50 | self.window = window 51 | self.forward_transform = None 52 | scale = self.filter_length / self.hop_length 53 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 54 | 55 | cutoff = int((self.filter_length / 2 + 1)) 56 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 57 | np.imag(fourier_basis[:cutoff, :])]) 58 | 59 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 60 | inverse_basis = torch.FloatTensor( 61 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 62 | 63 | if window is not None: 64 | assert(filter_length >= win_length) 65 | # get window and zero center pad it to filter_length 66 | fft_window = get_window(window, win_length, fftbins=True) 67 | fft_window = pad_center(fft_window, filter_length) 68 | fft_window = torch.from_numpy(fft_window).float() 69 | 70 | # window the bases 71 | forward_basis *= fft_window 72 | inverse_basis *= fft_window 73 | 74 | self.register_buffer('forward_basis', forward_basis.float()) 75 | self.register_buffer('inverse_basis', inverse_basis.float()) 76 | 77 | def transform(self, input_data): 78 | num_batches = input_data.size(0) 79 | num_samples = input_data.size(1) 80 | 81 | self.num_samples = num_samples 82 | 83 | # similar to librosa, reflect-pad the input 84 | input_data = input_data.view(num_batches, 1, num_samples) 85 | input_data = F.pad( 86 | input_data.unsqueeze(1), 87 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 88 | mode='reflect') 89 | input_data = input_data.squeeze(1) 90 | 91 | forward_transform = F.conv1d( 92 | input_data, 93 | Variable(self.forward_basis, requires_grad=False), 94 | stride=self.hop_length, 95 | padding=0) 96 | 97 | cutoff = int((self.filter_length / 2) + 1) 98 | real_part = forward_transform[:, :cutoff, :] 99 | imag_part = forward_transform[:, cutoff:, :] 100 | 101 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 102 | phase = torch.autograd.Variable( 103 | torch.atan2(imag_part.data, real_part.data)) 104 | 105 | return magnitude, phase 106 | 107 | def inverse(self, magnitude, phase): 108 | recombine_magnitude_phase = torch.cat( 109 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 110 | 111 | inverse_transform = F.conv_transpose1d( 112 | recombine_magnitude_phase, 113 | Variable(self.inverse_basis, requires_grad=False), 114 | stride=self.hop_length, 115 | padding=0) 116 | 117 | if self.window is not None: 118 | window_sum = window_sumsquare( 119 | self.window, magnitude.size(-1), hop_length=self.hop_length, 120 | win_length=self.win_length, n_fft=self.filter_length, 121 | dtype=np.float32) 122 | # remove modulation effects 123 | approx_nonzero_indices = torch.from_numpy( 124 | np.where(window_sum > tiny(window_sum))[0]) 125 | window_sum = torch.autograd.Variable( 126 | torch.from_numpy(window_sum), requires_grad=False) 127 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 128 | 
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
129 | 
130 |         # scale by hop ratio
131 |         inverse_transform *= float(self.filter_length) / self.hop_length
132 | 
133 |         inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
134 |         inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
135 | 
136 |         return inverse_transform
137 | 
138 |     def forward(self, input_data):
139 |         self.magnitude, self.phase = self.transform(input_data)
140 |         reconstruction = self.inverse(self.magnitude, self.phase)
141 |         return reconstruction
142 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from text import cleaners 6 | from text.symbols import symbols 7 | 8 | # Mappings from symbol to numeric ID and vice versa: 9 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 10 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 11 | 12 | # Regular expression matching text enclosed in curly braces: 13 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 14 | 15 | 16 | def text_to_sequence(text, cleaner_names): 17 |     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 18 | 19 |     The text can optionally have ARPAbet sequences enclosed in curly braces embedded 20 |     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
21 | 
22 |     Args: 
23 |       text: string to convert to a sequence 
24 |       cleaner_names: names of the cleaner functions to run the text through 
25 | 
26 |     Returns: 
27 |       List of integers corresponding to the symbols in the text 
28 |     ''' 
29 |     sequence = [_symbol_to_id['^']] 
30 | 
31 |     # Check for curly braces and treat their contents as ARPAbet: 
32 |     while len(text): 
33 |         m = _curly_re.match(text) 
34 |         if not m: 
35 |             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 
36 |             break 
37 |         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 
38 |         sequence += _arpabet_to_sequence(m.group(2)) 
39 |         text = m.group(3) 
40 | 
41 |     # Append EOS token 
42 |     sequence.append(_symbol_to_id['~']) 
43 |     return sequence 
44 | 
45 | 
46 | def sequence_to_text(sequence): 
47 |     '''Converts a sequence of IDs back to a string''' 
48 |     result = '' 
49 |     for symbol_id in sequence: 
50 |         if symbol_id in _id_to_symbol: 
51 |             s = _id_to_symbol[symbol_id] 
52 |             # Enclose ARPAbet back in curly braces: 
53 |             if len(s) > 1 and s[0] == '@': 
54 |                 s = '{%s}' % s[1:] 
55 |             result += s 
56 |     return result.replace('}{', ' ') 
57 | 
58 | 
59 | def _clean_text(text, cleaner_names): 
60 |     for name in cleaner_names: 
61 |         cleaner = getattr(cleaners, name, None) 
62 |         if not cleaner: 
63 |             raise Exception('Unknown cleaner: %s' % name) 
64 |         text = cleaner(text) 
65 |     return text 
66 | 
67 | 
68 | def _symbols_to_sequence(symbols): 
69 |     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 
70 | 
71 | 
72 | def _arpabet_to_sequence(text): 
73 |     return _symbols_to_sequence(['@' + s for s in text.split()]) 
74 | 
75 | 
76 | def _should_keep_symbol(s): 
77 |     return s in _symbol_to_id and s != '_' and s != '~' 
78 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """This file is derived from https://github.com/keithito/tacotron. 2 | 3 | Cleaners are transformations that run over the input text at both training and eval time. 4 | 5 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 6 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 7 |   1. "english_cleaners" for English text 8 |   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 9 |      the Unidecode library (https://pypi.python.org/pypi/Unidecode) 10 |   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 11 |      the symbols in symbols.py to match your data). 12 | """ 13 | 14 | import re 15 | 16 | from unidecode import unidecode 17 | 18 | from text.numbers import normalize_numbers 19 | 20 | # Regular expression matching whitespace: 21 | _whitespace_re = re.compile(r'\s+') 22 | 23 | # List of (regular expression, replacement) pairs for abbreviations: 24 | _abbreviations = [(re.compile('\\b%s\\.'
% x[0], re.IGNORECASE), x[1]) for x in [ 
25 |   ('mrs', 'misess'), 
26 |   ('mr', 'mister'), 
27 |   ('dr', 'doctor'), 
28 |   ('st', 'saint'), 
29 |   ('co', 'company'), 
30 |   ('jr', 'junior'), 
31 |   ('maj', 'major'), 
32 |   ('gen', 'general'), 
33 |   ('drs', 'doctors'), 
34 |   ('rev', 'reverend'), 
35 |   ('lt', 'lieutenant'), 
36 |   ('hon', 'honorable'), 
37 |   ('sgt', 'sergeant'), 
38 |   ('capt', 'captain'), 
39 |   ('esq', 'esquire'), 
40 |   ('ltd', 'limited'), 
41 |   ('col', 'colonel'), 
42 |   ('ft', 'fort'), 
43 | ]] 
44 | 
45 | 
46 | def expand_abbreviations(text): 
47 |     for regex, replacement in _abbreviations: 
48 |         text = re.sub(regex, replacement, text) 
49 |     return text 
50 | 
51 | 
52 | def expand_numbers(text): 
53 |     return normalize_numbers(text) 
54 | 
55 | 
56 | def lowercase(text): 
57 |     return text.lower() 
58 | 
59 | 
60 | def collapse_whitespace(text): 
61 |     return re.sub(_whitespace_re, ' ', text) 
62 | 
63 | 
64 | def convert_to_ascii(text): 
65 |     return unidecode(text) 
66 | 
67 | 
68 | def basic_cleaners(text): 
69 |     '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 
70 |     text = lowercase(text) 
71 |     text = collapse_whitespace(text) 
72 |     return text 
73 | 
74 | 
75 | def transliteration_cleaners(text): 
76 |     '''Pipeline for non-English text that transliterates to ASCII.''' 
77 |     text = convert_to_ascii(text) 
78 |     text = lowercase(text) 
79 |     text = collapse_whitespace(text) 
80 |     return text 
81 | 
82 | 
83 | def english_cleaners(text): 
84 |     '''Pipeline for English text, including number and abbreviation expansion.''' 
85 |     text = convert_to_ascii(text) 
86 |     text = lowercase(text) 
87 |     text = expand_numbers(text) 
88 |     text = expand_abbreviations(text) 
89 |     text = collapse_whitespace(text) 
90 |     return text 
91 | 
92 | 
93 | # NOTE (kan-bayashi): The following functions are additionally defined and are not included in the original code. 
94 | def remove_unnecessary_symbols(text): 
95 |     # added 
96 |     text = re.sub(r'[\(\)\[\]\<\>\"]+', '', text) 
97 |     return text 
98 | 
99 | 
100 | def expand_symbols(text): 
101 |     # added 
102 |     text = re.sub(";", ",", text) 
103 |     text = re.sub(":", ",", text) 
104 |     text = re.sub("-", " ", text) 
105 |     text = re.sub("&", "and", text) 
106 |     return text 
107 | 
108 | 
109 | def uppercase(text): 
110 |     # added 
111 |     return text.upper() 
112 | 
113 | 
114 | def custom_english_cleaners(text): 
115 |     '''Custom pipeline for English text, including number and abbreviation expansion.''' 
116 |     text = convert_to_ascii(text) 
117 |     text = lowercase(text) 
118 |     text = expand_numbers(text) 
119 |     text = expand_abbreviations(text) 
120 |     text = expand_symbols(text) 
121 |     text = remove_unnecessary_symbols(text) 
122 |     text = uppercase(text) 
123 |     text = collapse_whitespace(text) 
124 | 
125 |     # Strip wrapping quotes for one exceptional utterance found in the data, e.g.:
126 | # "'NOW FOR YOU, MY POOR FELLOW MORTALS, WHO ARE ABOUT TO SUFFER THE LAST PENALTY OF THE LAW.'" 127 | if text[0]=="'" and text[-1]=="'": 128 | text = text[1:-1] 129 | 130 | return text 131 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | import inflect 5 | import re 6 | 7 | 8 | _inflect = inflect.engine() 9 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 10 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 11 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 12 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 13 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 14 | _number_re = re.compile(r'[0-9]+') 15 | 16 | 17 | def _remove_commas(m): 18 | return m.group(1).replace(',', '') 19 | 20 | 21 | def _expand_decimal_point(m): 22 | return m.group(1).replace('.', ' point ') 23 | 24 | 25 | def _expand_dollars(m): 26 | match = m.group(1) 27 | parts = match.split('.') 28 | if len(parts) > 2: 29 | return match + ' dollars' # Unexpected format 30 | dollars = int(parts[0]) if parts[0] else 0 31 | cents = 
int(parts[1]) if len(parts) > 1 and parts[1] else 0 32 | if dollars and cents: 33 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 34 | cent_unit = 'cent' if cents == 1 else 'cents' 35 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 36 | elif dollars: 37 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 38 | return '%s %s' % (dollars, dollar_unit) 39 | elif cents: 40 | cent_unit = 'cent' if cents == 1 else 'cents' 41 | return '%s %s' % (cents, cent_unit) 42 | else: 43 | return 'zero dollars' 44 | 45 | 46 | def _expand_ordinal(m): 47 | return _inflect.number_to_words(m.group(0)) 48 | 49 | 50 | def _expand_number(m): 51 | num = int(m.group(0)) 52 | if num > 1000 and num < 3000: 53 | if num == 2000: 54 | return 'two thousand' 55 | elif num > 2000 and num < 2010: 56 | return 'two thousand ' + _inflect.number_to_words(num % 100) 57 | elif num % 100 == 0: 58 | return _inflect.number_to_words(num // 100) + ' hundred' 59 | else: 60 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 61 | else: 62 | return _inflect.number_to_words(num, andword='') 63 | 64 | 65 | def normalize_numbers(text): 66 | text = re.sub(_comma_number_re, _remove_commas, text) 67 | text = re.sub(_pounds_re, r'\1 pounds', text) 68 | text = re.sub(_dollars_re, _expand_dollars, text) 69 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 70 | text = re.sub(_ordinal_re, _expand_ordinal, text) 71 | text = re.sub(_number_re, _expand_number, text) 72 | return text 73 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from text import cmudict 8 | 9 | _pad = '_' 10 | _sos = '^' 11 | _eos = '~' 12 | _punctuations = " ,.'?!" 
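# A minimal usage sketch, for illustration only (it mirrors the mapping built in utils/text2seq.py and assumes the default phoneme setup defined below): symbol_to_id = {s: i for i, s in enumerate(symbols)} maps each symbol to its integer id, so symbol_to_id['_'] == 0 (the pad token comes first) and symbol_to_id['@HH'] gives the id of the ARPAbet phone 'HH'.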
13 | _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 14 | 15 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 16 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 17 | _arpabet = [ p for p in _arpabet if not p+'0' in _arpabet] 18 | 19 | # Export all symbols: 20 | #symbols = [_pad, _sos, _eos] + list(_punctuations) + list(_characters) + _arpabet 21 | symbols = [_pad] + list(_punctuations) + _arpabet -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os, argparse 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from modules.model import Model 6 | import hparams as hp 7 | from text import * 8 | from utils.utils import * 9 | from utils.plot_image import * 10 | from apex import amp 11 | 12 | 13 | def validate(model, val_loader, iteration, writer): 14 | model.eval() 15 | with torch.no_grad(): 16 | n_data, val_recon_loss, val_kl_loss, val_duration_loss, val_align_loss = 0, 0, 0, 0, 0 17 | for i, batch in enumerate(val_loader): 18 | n_data += len(batch[0]) 19 | text_padded, text_lengths, mel_padded, mel_lengths = [ x.cuda() for x in batch ] 20 | text_mask, mel_mask, diag_mask = model.prepare_mask(text_lengths, mel_lengths) 21 | 22 | ##### Text ##### 23 | key, value = model.TextEnc(text_padded, text_mask) 24 | 25 | ##### Bottom_Up ##### 26 | query=model.bottom_up(mel_padded, mel_mask) 27 | 28 | ##### Alignment ##### 29 | h, align = model.get_align(query, key, value, text_lengths, mel_lengths, text_mask, mel_mask) 30 | 31 | ##### Top_Down ##### 32 | mel_pred, kl_loss = model.top_down(h, mel_mask) 33 | 34 | ##### Compute Loss ##### 35 | duration_out = model.get_duration(value, text_mask) 36 | recon_loss, duration_loss, align_loss = model.compute_loss(mel_pred, 37 | mel_padded, 38 | duration_out, 39 | align, 40 | mel_lengths, 41 | text_mask, 42 | mel_mask, 43 | diag_mask) 44 | 45 | val_recon_loss += recon_loss.item() * len(batch[0]) 46 | val_kl_loss += kl_loss.item() * len(batch[0]) 47 | val_duration_loss += duration_loss.item() * len(batch[0]) 48 | val_align_loss += align_loss.item() * len(batch[0]) 49 | 50 | val_recon_loss /= n_data 51 | val_kl_loss /= n_data 52 | val_duration_loss /= n_data 53 | val_align_loss /= n_data 54 | 55 | writer.add_scalar('val_recon_loss', val_recon_loss, global_step=iteration) 56 | writer.add_scalar('val_kl_loss', val_kl_loss, global_step=iteration) 57 | writer.add_scalar('val_duration_loss', val_duration_loss, global_step=iteration) 58 | writer.add_scalar('val_align_loss', val_align_loss, global_step=iteration) 59 | 60 | mel_plots, align_plots = plot_image(mel_padded, 61 | mel_pred, 62 | align, 63 | text_padded, 64 | mel_lengths, 65 | text_lengths) 66 | writer.add_figure('Validation mel_plots', mel_plots, global_step=iteration) 67 | writer.add_figure('Validation align_plots', align_plots, global_step=iteration) 68 | 69 | mel_out, durations = model.inference(text_padded[-1:, :text_lengths[-1]]) 70 | align = torch.repeat_interleave(torch.eye(len(durations[0].cpu())).to(torch.long), 71 | durations[0].cpu(), 72 | dim=0).unsqueeze(0) 73 | mel_lengths[-1] = mel_out.size(2) 74 | mel_plots, align_plots = plot_image(torch.zeros_like(mel_padded), 75 | mel_out, 76 | align, 77 | text_padded, 78 | mel_lengths, 79 | text_lengths) 80 | writer.add_figure('Validation mel_plots_inference', mel_plots, global_step=iteration) 81 | writer.add_figure('Validation 
align_plots_inference', align_plots, global_step=iteration) 82 | model.train() 83 | 84 | 85 | 86 | def main(args): 87 | train_loader, val_loader, collate_fn = prepare_dataloaders(hp) 88 | model = Model(hp).cuda() 89 | optimizer = torch.optim.Adamax(model.parameters(), lr=hp.lr) 90 | writer = get_writer(hp.output_directory, args.logdir) 91 | model, optimizer = amp.initialize(model, optimizer, opt_level="O1") 92 | 93 | iteration = 0 94 | model.train() 95 | print(f"Training Start!!! ({args.logdir})") 96 | while iteration < (hp.train_steps): 97 | for i, batch in enumerate(train_loader): 98 | text_padded, text_lengths, mel_padded, mel_lengths = [ x.cuda() for x in batch ] 99 | recon_loss, kl_loss, duration_loss, align_loss = model(text_padded, mel_padded, text_lengths, mel_lengths) 100 | 101 | alpha=min(1, iteration/hp.kl_warmup_steps) 102 | with amp.scale_loss((recon_loss + alpha*kl_loss + duration_loss + align_loss), optimizer) as scaled_loss: 103 | scaled_loss.backward() 104 | 105 | iteration += 1 106 | lr_scheduling(optimizer, iteration) 107 | nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh) 108 | optimizer.step() 109 | model.zero_grad() 110 | writer.add_scalar('train_recon_loss', recon_loss, global_step=iteration) 111 | writer.add_scalar('train_kl_loss', kl_loss, global_step=iteration) 112 | writer.add_scalar('train_duration_loss', duration_loss, global_step=iteration) 113 | writer.add_scalar('train_align_loss', align_loss, global_step=iteration) 114 | 115 | if iteration % (hp.iters_per_validation) == 0: 116 | validate(model, val_loader, iteration, writer) 117 | 118 | if iteration % (hp.iters_per_checkpoint) == 0: 119 | save_checkpoint(model, optimizer, hp.lr, iteration, filepath=f'{hp.output_directory}/{args.logdir}') 120 | 121 | if iteration == (hp.train_steps): 122 | break 123 | 124 | 125 | if __name__ == '__main__': 126 | p = argparse.ArgumentParser() 127 | p.add_argument('--gpu', type=str, default='0') 128 | p.add_argument('-d', '--logdir', type=str, required=True) 129 | args = p.parse_args() 130 | 131 | os.environ["CUDA_VISIBLE_DEVICES"]=args.gpu 132 | torch.manual_seed(hp.seed) 133 | torch.cuda.manual_seed(hp.seed) 134 | 135 | main(args) 136 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import hparams as hp 4 | import torch 5 | import torch.utils.data 6 | import torch.nn.functional as F 7 | import os 8 | import pickle as pkl 9 | from text import text_to_sequence 10 | 11 | 12 | def load_filepaths_and_text(metadata, split="|"): 13 | with open(metadata, encoding='utf-8') as f: 14 | filepaths_and_text = [line.strip().split(split) for line in f] 15 | return filepaths_and_text 16 | 17 | 18 | class TextMelSet(torch.utils.data.Dataset): 19 | def __init__(self, audiopaths_and_text, hp): 20 | self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) 21 | self.data_type=hp.data_type 22 | self.seq_list=[] 23 | self.mel_list=[] 24 | for f in self.audiopaths_and_text: 25 | file_name = f[0][:10] 26 | seq_path = os.path.join(hp.data_path, self.data_type) 27 | mel_path = os.path.join(hp.data_path, 'melspectrogram') 28 | self.seq_list.append(torch.from_numpy(np.load(f'{seq_path}/{file_name}_sequence.npy'))) 29 | self.mel_list.append(torch.from_numpy(np.load(f'{mel_path}/{file_name}_melspectrogram.npy'))) 30 | 31 | def __getitem__(self, index): 32 | return (self.seq_list[index], 
self.mel_list[index]) 33 | 34 | def __len__(self): 35 | return len(self.audiopaths_and_text) 36 | 37 | 38 | class TextMelCollate(): 39 | def __init__(self): 40 | return 41 | 42 | def __call__(self, batch): 43 | # Right zero-pad all one-hot text sequences to max input length 44 | input_lengths=torch.LongTensor([len(x[0]) for x in batch]) 45 | max_input_len = input_lengths.max().item() 46 | 47 | text_padded = torch.zeros(len(batch), max_input_len, dtype=torch.long) 48 | for i in range(len(batch)): 49 | text = batch[i][0] 50 | text_padded[i, :text.size(0)] = text 51 | 52 | # Right zero-pad melspectrogram 53 | num_mels = batch[0][1].size(0) 54 | max_target_len = max([x[1].size(1) for x in batch]) 55 | if max_target_len%hp.downsample_ratio != 0: 56 | max_target_len = max_target_len - max_target_len%hp.downsample_ratio 57 | 58 | mel_padded = torch.zeros(len(batch), num_mels, max_target_len) 59 | output_lengths = torch.LongTensor(len(batch)) 60 | for i in range(len(batch)): 61 | mel = batch[i][1] 62 | if mel.size(1)%hp.downsample_ratio!=0: 63 | mel=mel[:,:-(mel.size(1)%hp.downsample_ratio)] 64 | mel_padded[i, :, :mel.size(1)] = mel 65 | output_lengths[i] = mel.size(1) 66 | 67 | # Normalize 68 | mel_padded = (torch.clamp(mel_padded, hp.min_db, hp.max_db)-hp.min_db) / (hp.max_db-hp.min_db) 69 | 70 | return text_padded, input_lengths, mel_padded, output_lengths 71 | -------------------------------------------------------------------------------- /utils/plot_image.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import torch.nn.functional as F 3 | from text.symbols import symbols 4 | import hparams as hp 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | 12 | 13 | def plot_image(target, melspec, alignments, text, mel_lengths, text_lengths): 14 | # Draw mel_plots 15 | mel_plots, axes = plt.subplots(2,1,figsize=(20,15)) 16 | L, T = text_lengths[-1], mel_lengths[-1] 17 | 18 | axes[0].imshow(target[-1].detach().cpu()[:,:T], 19 | origin='lower', 20 | aspect='auto') 21 | 22 | axes[1].imshow(melspec[-1].detach().cpu()[:,:T], 23 | origin='lower', 24 | aspect='auto') 25 | 26 | # Draw alignments 27 | align_plots, axes = plt.subplots(2,1,figsize=(20,15)) 28 | alignments = alignments[-1].repeat_interleave(int(hp.downsample_ratio),0).t() 29 | alignments = alignments.detach().cpu()[:L,:T] 30 | 31 | axes[0].imshow(alignments, 32 | origin='lower', 33 | aspect='auto') 34 | 35 | _, alignments = alignments.max(dim=0) 36 | alignments = F.one_hot(alignments, L).t() 37 | axes[1].imshow(alignments, 38 | origin='lower', 39 | aspect='auto') 40 | 41 | for i in range(2): 42 | plt.sca(axes[i]) 43 | plt.xticks(range(T), [ f'{i}' if (i%10==0 or i==T-1) else '' for i in range(T) ]) 44 | plt.yticks(range(L), [id_to_symbol[c] for c in text[-1].detach().cpu().numpy()[:L]]) 45 | for yc in range(L): 46 | plt.axhline(y=yc, c='r', linestyle='--', linewidth=0.5) 47 | 48 | return mel_plots, align_plots -------------------------------------------------------------------------------- /utils/text2seq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from g2p_en import G2p 3 | from text import * 4 | from text.cleaners import custom_english_cleaners 5 | from text.symbols import symbols 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | symbol_to_id = {s: i for 
i, s in enumerate(symbols)} 9 | id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | g2p = G2p() 11 | 12 | 13 | def text2symbols(text, dtype): 14 | clean_char = custom_english_cleaners(text.rstrip()).rstrip() 15 | if clean_char[-1] in ['.', ',']: 16 | while clean_char[-1] in ['.', ',']: 17 | clean_char = clean_char[:-1] 18 | clean_char = clean_char + '.' 19 | elif clean_char[-1] in ['!', '?']: 20 | clean_char = clean_char 21 | else: 22 | clean_char = clean_char + '.' 23 | 24 | if dtype=='char': 25 | return clean_char 26 | 27 | clean_phone = [] 28 | for s in g2p(clean_char.lower()): 29 | if (s in [',', '!', '.', '?', "'"]) and (clean_phone[-1]==' '): 30 | clean_phone.pop() 31 | clean_phone.append(s) 32 | 33 | elif '@'+s in symbol_to_id: 34 | clean_phone.append('@'+s) 35 | 36 | else: 37 | clean_phone.append(s) 38 | 39 | return clean_phone 40 | 41 | 42 | def symbols2seq(symbols): 43 | return np.asarray([symbol_to_id[s] for s in symbols], dtype=np.int64) 44 | 45 | 46 | def text2seq(text, dtype='phone'): 47 | symbols = text2symbols(text, dtype) 48 | return symbols2seq(symbols) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import DataLoader 4 | from .data_utils import TextMelSet, TextMelCollate 5 | from torch.utils.tensorboard import SummaryWriter 6 | import hparams as hp 7 | 8 | 9 | def prepare_dataloaders(hp): 10 | # Get data, data loaders and collate function ready 11 | trainset = TextMelSet(hp.training_files, hp) 12 | valset = TextMelSet(hp.validation_files, hp) 13 | collate_fn = TextMelCollate() 14 | 15 | train_loader = DataLoader(trainset, 16 | num_workers=4, 17 | shuffle=True, 18 | batch_size=hp.batch_size, 19 | drop_last=True, 20 | collate_fn=collate_fn) 21 | 22 | val_loader = DataLoader(valset, 23 | num_workers=4, 24 | batch_size=hp.batch_size, 25 | collate_fn=collate_fn) 26 | 27 | return train_loader, val_loader, collate_fn 28 | 29 | 30 | def get_writer(output_directory, log_directory): 31 | logging_path=f'{output_directory}/{log_directory}' 32 | if not os.path.exists(logging_path): 33 | os.mkdir(logging_path) 34 | writer = SummaryWriter(logging_path) 35 | return writer 36 | 37 | 38 | def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): 39 | print(f"Saving model and optimizer state at iteration {iteration} to {filepath}") 40 | torch.save({'iteration': iteration, 41 | 'state_dict': model.state_dict(), 42 | 'optimizer': optimizer.state_dict(), 43 | 'learning_rate': learning_rate}, f'{filepath}/checkpoint_{iteration}') 44 | 45 | 46 | def lr_scheduling(opt, step, init_lr=hp.lr, warmup_steps=hp.lr_warmup_steps): 47 | opt.param_groups[0]['lr'] = init_lr * warmup_steps**0.5 * min(step ** -0.5, step * warmup_steps ** -1.5) 48 | return 49 | 50 | 51 | def get_mask_from_lengths(lengths): 52 | max_len = torch.max(lengths).item() 53 | ids = lengths.new_tensor(torch.arange(0, max_len, device=lengths.device)) 54 | mask = (lengths.unsqueeze(1) <= ids).to(torch.bool) 55 | return mask.detach() 56 | 57 | 58 | def count_parameters(model): 59 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 60 | 61 | 62 | def PositionalEncoding(d_model, lengths, w_s=None): 63 | L = int(lengths.max().item()) 64 | if w_s is None: 65 | position = torch.arange(0, L, dtype=torch.float).unsqueeze(0).to(lengths.device) 66 | else: 67 | position = torch.arange(0, L, 
dtype=torch.float).unsqueeze(0).to(lengths.device) * w_s.unsqueeze(-1) 68 | div_term = torch.pow(10000, torch.arange(0, d_model, 2).float() / d_model).to(lengths.device) 69 | pe = torch.zeros(len(lengths), L, d_model).to(lengths.device) 70 | 71 | pe[:, :, 0::2] = torch.sin(position.unsqueeze(-1) / div_term.unsqueeze(0)) 72 | pe[:, :, 1::2] = torch.cos(position.unsqueeze(-1) / div_term.unsqueeze(0)) 73 | return pe 74 | -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_10.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_11.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_11.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_12.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_12.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_20.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_20.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_21.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_21.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_22.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_22.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_30.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_30.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_31.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_31.wav 
-------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_32.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_32.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_40.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_40.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_41.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_41.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_42.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/HelloMyFriends5.0_42.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_10.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_11.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_11.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_12.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_12.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_20.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_20.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_21.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_21.wav -------------------------------------------------------------------------------- /wav_samples/Ablation 
study_hierarchy/OneTwoThree5.0_22.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_22.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_30.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_30.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_31.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_31.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_32.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_32.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_40.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_40.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_41.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_41.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/OneTwoThree5.0_42.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/OneTwoThree5.0_42.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_10.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_11.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_11.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_12.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_12.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_20.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_20.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_21.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_21.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_22.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_22.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_30.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_30.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_31.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_31.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_32.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_32.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_40.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_40.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_41.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_41.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_hierarchy/TrickOrTreat5.0_42.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation 
study_hierarchy/TrickOrTreat5.0_42.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_speed control/Concatenated.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_speed control/Concatenated.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_speed control/Emphasis on one.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_speed control/Emphasis on one.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_speed control/Emphasis on three.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_speed control/Emphasis on three.wav -------------------------------------------------------------------------------- /wav_samples/Ablation study_speed control/Emphasis on two.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/Ablation study_speed control/Emphasis on two.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ023-0016.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/0.GT_wav/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/0.GT_wav/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ023-0016.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/1.GT_mel/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/1.GT_mel/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ023-0016.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/2.Tacotron 2/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/2.Tacotron 2/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ023-0016.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/3.Glow-TTS/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/3.Glow-TTS/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ023-0016.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/4.BVAE-TTS/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/4.BVAE-TTS/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ002-0253.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ002-0253.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ002-0260.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ002-0260.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ008-0121.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ008-0121.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ011-0141.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ011-0141.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ015-0194.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ015-0194.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ023-0016.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ023-0016.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ028-0145.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ028-0145.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ028-0349.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ028-0349.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ031-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ031-0014.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ046-0191.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/wav_samples/MOS-ID/5.BVAE-TTS_nojitter/LJ046-0191.wav -------------------------------------------------------------------------------- /wav_samples/MOS-ID/samples.txt: -------------------------------------------------------------------------------- 1 | LJ028-0349|who were each required to send so large a number to Babylon, that in all there were collected no fewer than fifty thousand. 2 | LJ002-0260|Yet the public opinion of the whole body seems to have checked dissipation. 3 | LJ031-0014|the Presidential limousine arrived at the emergency entrance of the Parkland Hospital at about twelve:thirty-five p.m. 4 | LJ046-0191|it had established periodic regular review of the status of four hundred individuals; 5 | LJ002-0253|were governed by rules which they themselves had framed, and under which subscriptions were levied 6 | LJ008-0121|After the construction and action of the machine had been explained, the doctor asked the governor what kind of men he had commanded at Goree, 7 | LJ028-0145|And here I may not omit to tell the use to which the mould dug out of the great moat was turned, nor the manner wherein the wall was wrought. 8 | LJ015-0194|and behaved so as to justify a belief that he had been a jail-bird all his life. 9 | LJ023-0016|In nineteen thirty-three you and I knew that we must never let our economic system get completely out of joint again 10 | LJ011-0141|There were at the moment in Newgate six convicts sentenced to death for forging wills. -------------------------------------------------------------------------------- /waveglow/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, NVIDIA Corporation 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /waveglow/README.md: -------------------------------------------------------------------------------- 1 | ![WaveGlow](waveglow_logo.png "WaveGLow") 2 | 3 | ## WaveGlow: a Flow-based Generative Network for Speech Synthesis 4 | 5 | ### Ryan Prenger, Rafael Valle, and Bryan Catanzaro 6 | 7 | In our recent [paper], we propose WaveGlow: a flow-based network capable of 8 | generating high quality speech from mel-spectrograms. WaveGlow combines insights 9 | from [Glow] and [WaveNet] in order to provide fast, efficient and high-quality 10 | audio synthesis, without the need for auto-regression. WaveGlow is implemented 11 | using only a single network, trained using only a single cost function: 12 | maximizing the likelihood of the training data, which makes the training 13 | procedure simple and stable. 14 | 15 | Our [PyTorch] implementation produces audio samples at a rate of 2750 16 | kHz on an NVIDIA V100 GPU. Mean Opinion Scores show that it delivers audio 17 | quality as good as the best publicly available WaveNet implementation. 18 | 19 | Visit our [website] for audio samples. 20 | 21 | ## Setup 22 | 23 | 1. Clone our repo and initialize submodule 24 | 25 | ```command 26 | git clone https://github.com/NVIDIA/waveglow.git 27 | cd waveglow 28 | git submodule init 29 | git submodule update 30 | ``` 31 | 32 | 2. Install requirements `pip3 install -r requirements.txt` 33 | 34 | 3. Install [Apex] 35 | 36 | 37 | ## Generate audio with our pre-existing model 38 | 39 | 1. Download our [published model] 40 | 2. Download [mel-spectrograms] 41 | 3. Generate audio `python3 inference.py -f <(ls mel_spectrograms/*.pt) -w waveglow_256channels.pt -o . --is_fp16 -s 0.6` 42 | 43 | N.b. use `convert_model.py` to convert your older models to the current model 44 | with fused residual and skip connections. 45 | 46 | ## Train your own model 47 | 48 | 1. Download [LJ Speech Data]. 
In this example it's in `data/` 49 | 50 | 2. Make a list of the file names to use for training/testing 51 | 52 | ```command 53 | ls data/*.wav | tail -n+10 > train_files.txt 54 | ls data/*.wav | head -n10 > test_files.txt 55 | ``` 56 | 57 | 3. Train your WaveGlow networks 58 | 59 | ```command 60 | mkdir checkpoints 61 | python train.py -c config.json 62 | ``` 63 | 64 | For multi-GPU training replace `train.py` with `distributed.py`. Only tested with single node and NCCL. 65 | 66 | For mixed precision training set `"fp16_run": true` on `config.json`. 67 | 68 | 4. Make test set mel-spectrograms 69 | 70 | `python mel2samp.py -f test_files.txt -o . -c config.json` 71 | 72 | 5. Do inference with your network 73 | 74 | ```command 75 | ls *.pt > mel_files.txt 76 | python3 inference.py -f mel_files.txt -w checkpoints/waveglow_10000 -o . --is_fp16 -s 0.6 77 | ``` 78 | 79 | [//]: # (TODO) 80 | [//]: # (PROVIDE INSTRUCTIONS FOR DOWNLOADING LJS) 81 | [pytorch 1.0]: https://github.com/pytorch/pytorch#installation 82 | [website]: https://nv-adlr.github.io/WaveGlow 83 | [paper]: https://arxiv.org/abs/1811.00002 84 | [WaveNet implementation]: https://github.com/r9y9/wavenet_vocoder 85 | [Glow]: https://blog.openai.com/glow/ 86 | [WaveNet]: https://deepmind.com/blog/wavenet-generative-model-raw-audio/ 87 | [PyTorch]: http://pytorch.org 88 | [published model]: https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing 89 | [mel-spectrograms]: https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view?usp=sharing 90 | [LJ Speech Data]: https://keithito.com/LJ-Speech-Dataset 91 | [Apex]: https://github.com/nvidia/apex 92 | -------------------------------------------------------------------------------- /waveglow/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_config": { 3 | "fp16_run": true, 4 | "output_directory": "checkpoints", 5 | "epochs": 100000, 6 | "learning_rate": 1e-4, 7 | "sigma": 1.0, 8 | "iters_per_checkpoint": 2000, 9 | "batch_size": 12, 10 | "seed": 1234, 11 | "checkpoint_path": "", 12 | "with_tensorboard": false 13 | }, 14 | "data_config": { 15 | "training_files": "train_files.txt", 16 | "segment_length": 16000, 17 | "sampling_rate": 22050, 18 | "filter_length": 1024, 19 | "hop_length": 256, 20 | "win_length": 1024, 21 | "mel_fmin": 0.0, 22 | "mel_fmax": 8000.0 23 | }, 24 | "dist_config": { 25 | "dist_backend": "nccl", 26 | "dist_url": "tcp://localhost:54321" 27 | }, 28 | 29 | "waveglow_config": { 30 | "n_mel_channels": 80, 31 | "n_flows": 12, 32 | "n_group": 8, 33 | "n_early_every": 4, 34 | "n_early_size": 2, 35 | "WN_config": { 36 | "n_layers": 8, 37 | "n_channels": 256, 38 | "kernel_size": 3 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /waveglow/convert_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import torch 4 | 5 | def _check_model_old_version(model): 6 | if hasattr(model.WN[0], 'res_layers'): 7 | return True 8 | else: 9 | return False 10 | 11 | def update_model(old_model): 12 | if not _check_model_old_version(old_model): 13 | return old_model 14 | new_model = copy.deepcopy(old_model) 15 | for idx in range(0, len(new_model.WN)): 16 | wavenet = new_model.WN[idx] 17 | wavenet.res_skip_layers = torch.nn.ModuleList() 18 | n_channels = wavenet.n_channels 19 | n_layers = wavenet.n_layers 20 | for i in range(0, n_layers): 21 | if i < n_layers - 1: 22 | 
res_skip_channels = 2*n_channels 23 | else: 24 | res_skip_channels = n_channels 25 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 26 | skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i]) 27 | if i < n_layers - 1: 28 | res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i]) 29 | res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight])) 30 | res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias])) 31 | else: 32 | res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight) 33 | res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias) 34 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 35 | wavenet.res_skip_layers.append(res_skip_layer) 36 | del wavenet.res_layers 37 | del wavenet.skip_layers 38 | return new_model 39 | 40 | if __name__ == '__main__': 41 | old_model_path = sys.argv[1] 42 | new_model_path = sys.argv[2] 43 | model = torch.load(old_model_path) 44 | model['model'] = update_model(model['model']) 45 | torch.save(model, new_model_path) 46 | 47 | -------------------------------------------------------------------------------- /waveglow/denoiser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('tacotron2') 3 | import torch 4 | from layers import STFT 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """ Removes model bias from audio produced with waveglow """ 9 | 10 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 11 | win_length=1024, mode='zeros'): 12 | super(Denoiser, self).__init__() 13 | self.stft = STFT(filter_length=filter_length, 14 | hop_length=int(filter_length/n_overlap), 15 | win_length=win_length).cuda() 16 | if mode == 'zeros': 17 | mel_input = torch.zeros( 18 | (1, 80, 88), 19 | dtype=waveglow.upsample.weight.dtype, 20 | device=waveglow.upsample.weight.device) 21 | elif mode == 'normal': 22 | mel_input = torch.randn( 23 | (1, 80, 88), 24 | dtype=waveglow.upsample.weight.dtype, 25 | device=waveglow.upsample.weight.device) 26 | else: 27 | raise Exception("Mode {} is not supported".format(mode)) 28 | 29 | with torch.no_grad(): 30 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 31 | bias_spec, _ = self.stft.transform(bias_audio) 32 | 33 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 34 | 35 | def forward(self, audio, strength=0.1): 36 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 37 | audio_spec_denoised = audio_spec - self.bias_spec * strength 38 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 39 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 40 | return audio_denoised 41 | -------------------------------------------------------------------------------- /waveglow/distributed.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import os 28 | import sys 29 | import time 30 | import subprocess 31 | import argparse 32 | 33 | import torch 34 | import torch.distributed as dist 35 | from torch.autograd import Variable 36 | 37 | def reduce_tensor(tensor, num_gpus): 38 | rt = tensor.clone() 39 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 40 | rt /= num_gpus 41 | return rt 42 | 43 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): 44 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 45 | print("Initializing Distributed") 46 | 47 | # Set cuda device so everything is done on the right GPU. 48 | torch.cuda.set_device(rank % torch.cuda.device_count()) 49 | 50 | # Initialize distributed communication 51 | dist.init_process_group(dist_backend, init_method=dist_url, 52 | world_size=num_gpus, rank=rank, 53 | group_name=group_name) 54 | 55 | def _flatten_dense_tensors(tensors): 56 | """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of 57 | same dense type. 58 | Since inputs are dense, the resulting tensor will be a concatenated 1D 59 | buffer. Element-wise operation on this buffer will be equivalent to 60 | operating individually. 61 | Arguments: 62 | tensors (Iterable[Tensor]): dense tensors to flatten. 63 | Returns: 64 | A contiguous 1D buffer containing input tensors. 65 | """ 66 | if len(tensors) == 1: 67 | return tensors[0].contiguous().view(-1) 68 | flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0) 69 | return flat 70 | 71 | def _unflatten_dense_tensors(flat, tensors): 72 | """View a flat buffer using the sizes of tensors. Assume that tensors are of 73 | same dense type, and that flat is given by _flatten_dense_tensors. 74 | Arguments: 75 | flat (Tensor): flattened dense tensors to unflatten. 76 | tensors (Iterable[Tensor]): dense tensors whose sizes will be used to 77 | unflatten flat. 78 | Returns: 79 | Unflattened dense tensors with sizes same as tensors and values from 80 | flat. 
81 | """ 82 | outputs = [] 83 | offset = 0 84 | for tensor in tensors: 85 | numel = tensor.numel() 86 | outputs.append(flat.narrow(0, offset, numel).view_as(tensor)) 87 | offset += numel 88 | return tuple(outputs) 89 | 90 | def apply_gradient_allreduce(module): 91 | """ 92 | Modifies existing model to do gradient allreduce, but doesn't change class 93 | so you don't need "module" 94 | """ 95 | if not hasattr(dist, '_backend'): 96 | module.warn_on_half = True 97 | else: 98 | module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 99 | 100 | for p in module.state_dict().values(): 101 | if not torch.is_tensor(p): 102 | continue 103 | dist.broadcast(p, 0) 104 | 105 | def allreduce_params(): 106 | if(module.needs_reduction): 107 | module.needs_reduction = False 108 | buckets = {} 109 | for param in module.parameters(): 110 | if param.requires_grad and param.grad is not None: 111 | tp = type(param.data) 112 | if tp not in buckets: 113 | buckets[tp] = [] 114 | buckets[tp].append(param) 115 | if module.warn_on_half: 116 | if torch.cuda.HalfTensor in buckets: 117 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 118 | " It is recommended to use the NCCL backend in this case. This currently requires" + 119 | "PyTorch built from top of tree master.") 120 | module.warn_on_half = False 121 | 122 | for tp in buckets: 123 | bucket = buckets[tp] 124 | grads = [param.grad.data for param in bucket] 125 | coalesced = _flatten_dense_tensors(grads) 126 | dist.all_reduce(coalesced) 127 | coalesced /= dist.get_world_size() 128 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 129 | buf.copy_(synced) 130 | 131 | for param in list(module.parameters()): 132 | def allreduce_hook(*unused): 133 | Variable._execution_engine.queue_callback(allreduce_params) 134 | if param.requires_grad: 135 | param.register_hook(allreduce_hook) 136 | dir(param) 137 | 138 | def set_needs_reduction(self, input, output): 139 | self.needs_reduction = True 140 | 141 | module.register_forward_hook(set_needs_reduction) 142 | return module 143 | 144 | 145 | def main(config, stdout_dir, args_str): 146 | args_list = ['train.py'] 147 | args_list += args_str.split(' ') if len(args_str) > 0 else [] 148 | 149 | args_list.append('--config={}'.format(config)) 150 | 151 | num_gpus = torch.cuda.device_count() 152 | args_list.append('--num_gpus={}'.format(num_gpus)) 153 | args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S"))) 154 | 155 | if not os.path.isdir(stdout_dir): 156 | os.makedirs(stdout_dir) 157 | os.chmod(stdout_dir, 0o775) 158 | 159 | workers = [] 160 | 161 | for i in range(num_gpus): 162 | args_list[-2] = '--rank={}'.format(i) 163 | stdout = None if i == 0 else open( 164 | os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w") 165 | print(args_list) 166 | p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout) 167 | workers.append(p) 168 | 169 | for p in workers: 170 | p.wait() 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument('-c', '--config', type=str, required=True, 176 | help='JSON file for configuration') 177 | parser.add_argument('-s', '--stdout_dir', type=str, default=".", 178 | help='directory to save stoud logs') 179 | parser.add_argument( 180 | '-a', '--args_str', type=str, default='', 181 | help='double quoted string with space separated key value pairs') 182 | 183 | args = parser.parse_args() 184 | main(args.config, args.stdout_dir, args.args_str) 
185 | -------------------------------------------------------------------------------- /waveglow/glow.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import copy 28 | import torch 29 | from torch.autograd import Variable 30 | import torch.nn.functional as F 31 | 32 | 33 | @torch.jit.script 34 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 35 | n_channels_int = n_channels[0] 36 | in_act = input_a+input_b 37 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 38 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 39 | acts = t_act * s_act 40 | return acts 41 | 42 | 43 | class WaveGlowLoss(torch.nn.Module): 44 | def __init__(self, sigma=1.0): 45 | super(WaveGlowLoss, self).__init__() 46 | self.sigma = sigma 47 | 48 | def forward(self, model_output): 49 | z, log_s_list, log_det_W_list = model_output 50 | for i, log_s in enumerate(log_s_list): 51 | if i == 0: 52 | log_s_total = torch.sum(log_s) 53 | log_det_W_total = log_det_W_list[i] 54 | else: 55 | log_s_total = log_s_total + torch.sum(log_s) 56 | log_det_W_total += log_det_W_list[i] 57 | 58 | loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total 59 | return loss/(z.size(0)*z.size(1)*z.size(2)) 60 | 61 | 62 | class Invertible1x1Conv(torch.nn.Module): 63 | """ 64 | The layer outputs both the convolution, and the log determinant 65 | of its weight matrix. 
If reverse=True it performs the convolution with the 66 | inverse of the weight matrix 67 | """ 68 | def __init__(self, c): 69 | super(Invertible1x1Conv, self).__init__() 70 | self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, 71 | bias=False) 72 | 73 | # Sample a random orthonormal matrix to initialize weights 74 | W = torch.qr(torch.FloatTensor(c, c).normal_())[0] 75 | 76 | # Ensure determinant is 1.0 not -1.0 77 | if torch.det(W) < 0: 78 | W[:,0] = -1*W[:,0] 79 | W = W.view(c, c, 1) 80 | self.conv.weight.data = W 81 | 82 | def forward(self, z, reverse=False): 83 | # shape 84 | batch_size, group_size, n_of_groups = z.size() 85 | 86 | W = self.conv.weight.squeeze() 87 | 88 | if reverse: 89 | if not hasattr(self, 'W_inverse'): 90 | # Reverse computation 91 | W_inverse = W.float().inverse() 92 | W_inverse = Variable(W_inverse[..., None]) 93 | if z.type() == 'torch.cuda.HalfTensor': 94 | W_inverse = W_inverse.half() 95 | self.W_inverse = W_inverse 96 | z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) 97 | return z 98 | else: 99 | # Forward computation 100 | log_det_W = batch_size * n_of_groups * torch.logdet(W) 101 | z = self.conv(z) 102 | return z, log_det_W 103 | 104 | 105 | class WN(torch.nn.Module): 106 | """ 107 | This is the WaveNet-like layer for the affine coupling. The primary difference 108 | from WaveNet is that the convolutions need not be causal. There is also no dilation 109 | size reset. The dilation only doubles on each layer. 110 | """ 111 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 112 | kernel_size): 113 | super(WN, self).__init__() 114 | assert(kernel_size % 2 == 1) 115 | assert(n_channels % 2 == 0) 116 | self.n_layers = n_layers 117 | self.n_channels = n_channels 118 | self.in_layers = torch.nn.ModuleList() 119 | self.res_skip_layers = torch.nn.ModuleList() 120 | self.cond_layers = torch.nn.ModuleList() 121 | 122 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 123 | start = torch.nn.utils.weight_norm(start, name='weight') 124 | self.start = start 125 | 126 | # Initializing last layer to 0 makes the affine coupling layers 127 | # do nothing at first.
This helps with training stability 128 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 129 | end.weight.data.zero_() 130 | end.bias.data.zero_() 131 | self.end = end 132 | 133 | for i in range(n_layers): 134 | dilation = 2 ** i 135 | padding = int((kernel_size*dilation - dilation)/2) 136 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 137 | dilation=dilation, padding=padding) 138 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 139 | self.in_layers.append(in_layer) 140 | 141 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 142 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 143 | self.cond_layers.append(cond_layer) 144 | 145 | # last one is not necessary 146 | if i < n_layers - 1: 147 | res_skip_channels = 2*n_channels 148 | else: 149 | res_skip_channels = n_channels 150 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 151 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 152 | self.res_skip_layers.append(res_skip_layer) 153 | 154 | def forward(self, forward_input): 155 | audio, spect = forward_input 156 | audio = self.start(audio) 157 | 158 | for i in range(self.n_layers): 159 | acts = fused_add_tanh_sigmoid_multiply( 160 | self.in_layers[i](audio), 161 | self.cond_layers[i](spect), 162 | torch.IntTensor([self.n_channels])) 163 | 164 | res_skip_acts = self.res_skip_layers[i](acts) 165 | if i < self.n_layers - 1: 166 | audio = res_skip_acts[:,:self.n_channels,:] + audio 167 | skip_acts = res_skip_acts[:,self.n_channels:,:] 168 | else: 169 | skip_acts = res_skip_acts 170 | 171 | if i == 0: 172 | output = skip_acts 173 | else: 174 | output = skip_acts + output 175 | return self.end(output) 176 | 177 | 178 | class WaveGlow(torch.nn.Module): 179 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 180 | n_early_size, WN_config): 181 | super(WaveGlow, self).__init__() 182 | 183 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 184 | n_mel_channels, 185 | 1024, stride=256) 186 | assert(n_group % 2 == 0) 187 | self.n_flows = n_flows 188 | self.n_group = n_group 189 | self.n_early_every = n_early_every 190 | self.n_early_size = n_early_size 191 | self.WN = torch.nn.ModuleList() 192 | self.convinv = torch.nn.ModuleList() 193 | 194 | n_half = int(n_group/2) 195 | 196 | # Set up layers with the right sizes based on how many dimensions 197 | # have been output already 198 | n_remaining_channels = n_group 199 | for k in range(n_flows): 200 | if k % self.n_early_every == 0 and k > 0: 201 | n_half = n_half - int(self.n_early_size/2) 202 | n_remaining_channels = n_remaining_channels - self.n_early_size 203 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 204 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 205 | self.n_remaining_channels = n_remaining_channels # Useful during inference 206 | 207 | def forward(self, forward_input): 208 | """ 209 | forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames 210 | forward_input[1] = audio: batch x time 211 | """ 212 | spect, audio = forward_input 213 | 214 | # Upsample spectrogram to size of audio 215 | spect = self.upsample(spect) 216 | assert(spect.size(2) >= audio.size(1)) 217 | if spect.size(2) > audio.size(1): 218 | spect = spect[:, :, :audio.size(1)] 219 | 220 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 221 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 222 | 223 | audio = audio.unfold(1, 
self.n_group, self.n_group).permute(0, 2, 1) 224 | output_audio = [] 225 | log_s_list = [] 226 | log_det_W_list = [] 227 | 228 | for k in range(self.n_flows): 229 | if k % self.n_early_every == 0 and k > 0: 230 | output_audio.append(audio[:,:self.n_early_size,:]) 231 | audio = audio[:,self.n_early_size:,:] 232 | 233 | audio, log_det_W = self.convinv[k](audio) 234 | log_det_W_list.append(log_det_W) 235 | 236 | n_half = int(audio.size(1)/2) 237 | audio_0 = audio[:,:n_half,:] 238 | audio_1 = audio[:,n_half:,:] 239 | 240 | output = self.WN[k]((audio_0, spect)) 241 | log_s = output[:, n_half:, :] 242 | b = output[:, :n_half, :] 243 | audio_1 = torch.exp(log_s)*audio_1 + b 244 | log_s_list.append(log_s) 245 | 246 | audio = torch.cat([audio_0, audio_1],1) 247 | 248 | output_audio.append(audio) 249 | return torch.cat(output_audio,1), log_s_list, log_det_W_list 250 | 251 | def infer(self, spect, sigma=1.0): 252 | spect = self.upsample(spect) 253 | # trim conv artifacts. maybe pad spec to kernel multiple 254 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 255 | spect = spect[:, :, :-time_cutoff] 256 | 257 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 258 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 259 | 260 | if spect.type() == 'torch.cuda.HalfTensor': 261 | audio = torch.cuda.HalfTensor(spect.size(0), 262 | self.n_remaining_channels, 263 | spect.size(2)).normal_() 264 | else: 265 | audio = torch.cuda.FloatTensor(spect.size(0), 266 | self.n_remaining_channels, 267 | spect.size(2)).normal_() 268 | 269 | audio = torch.autograd.Variable(sigma*audio) 270 | 271 | for k in reversed(range(self.n_flows)): 272 | n_half = int(audio.size(1)/2) 273 | audio_0 = audio[:,:n_half,:] 274 | audio_1 = audio[:,n_half:,:] 275 | 276 | output = self.WN[k]((audio_0, spect)) 277 | s = output[:, n_half:, :] 278 | b = output[:, :n_half, :] 279 | audio_1 = (audio_1 - b)/torch.exp(s) 280 | audio = torch.cat([audio_0, audio_1],1) 281 | 282 | audio = self.convinv[k](audio, reverse=True) 283 | 284 | if k % self.n_early_every == 0 and k > 0: 285 | if spect.type() == 'torch.cuda.HalfTensor': 286 | z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 287 | else: 288 | z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 289 | audio = torch.cat((sigma*z, audio),1) 290 | 291 | audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 292 | return audio 293 | 294 | @staticmethod 295 | def remove_weightnorm(model): 296 | waveglow = model 297 | for WN in waveglow.WN: 298 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 299 | WN.in_layers = remove(WN.in_layers) 300 | WN.cond_layers = remove(WN.cond_layers) 301 | WN.res_skip_layers = remove(WN.res_skip_layers) 302 | return waveglow 303 | 304 | 305 | def remove(conv_list): 306 | new_conv_list = torch.nn.ModuleList() 307 | for old_conv in conv_list: 308 | old_conv = torch.nn.utils.remove_weight_norm(old_conv) 309 | new_conv_list.append(old_conv) 310 | return new_conv_list 311 | -------------------------------------------------------------------------------- /waveglow/glow_old.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from glow import Invertible1x1Conv, remove 4 | 5 | 6 | @torch.jit.script 7 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 8 | n_channels_int = n_channels[0] 9 | in_act = input_a+input_b 10 | t_act = 
torch.tanh(in_act[:, :n_channels_int, :]) 11 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 12 | acts = t_act * s_act 13 | return acts 14 | 15 | 16 | class WN(torch.nn.Module): 17 | """ 18 | This is the WaveNet like layer for the affine coupling. The primary difference 19 | from WaveNet is the convolutions need not be causal. There is also no dilation 20 | size reset. The dilation only doubles on each layer 21 | """ 22 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 23 | kernel_size): 24 | super(WN, self).__init__() 25 | assert(kernel_size % 2 == 1) 26 | assert(n_channels % 2 == 0) 27 | self.n_layers = n_layers 28 | self.n_channels = n_channels 29 | self.in_layers = torch.nn.ModuleList() 30 | self.res_skip_layers = torch.nn.ModuleList() 31 | self.cond_layers = torch.nn.ModuleList() 32 | 33 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 34 | start = torch.nn.utils.weight_norm(start, name='weight') 35 | self.start = start 36 | 37 | # Initializing last layer to 0 makes the affine coupling layers 38 | # do nothing at first. This helps with training stability 39 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 40 | end.weight.data.zero_() 41 | end.bias.data.zero_() 42 | self.end = end 43 | 44 | for i in range(n_layers): 45 | dilation = 2 ** i 46 | padding = int((kernel_size*dilation - dilation)/2) 47 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 48 | dilation=dilation, padding=padding) 49 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 50 | self.in_layers.append(in_layer) 51 | 52 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 53 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 54 | self.cond_layers.append(cond_layer) 55 | 56 | # last one is not necessary 57 | if i < n_layers - 1: 58 | res_skip_channels = 2*n_channels 59 | else: 60 | res_skip_channels = n_channels 61 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 62 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 63 | self.res_skip_layers.append(res_skip_layer) 64 | 65 | def forward(self, forward_input): 66 | audio, spect = forward_input 67 | audio = self.start(audio) 68 | 69 | for i in range(self.n_layers): 70 | acts = fused_add_tanh_sigmoid_multiply( 71 | self.in_layers[i](audio), 72 | self.cond_layers[i](spect), 73 | torch.IntTensor([self.n_channels])) 74 | 75 | res_skip_acts = self.res_skip_layers[i](acts) 76 | if i < self.n_layers - 1: 77 | audio = res_skip_acts[:,:self.n_channels,:] + audio 78 | skip_acts = res_skip_acts[:,self.n_channels:,:] 79 | else: 80 | skip_acts = res_skip_acts 81 | 82 | if i == 0: 83 | output = skip_acts 84 | else: 85 | output = skip_acts + output 86 | return self.end(output) 87 | 88 | 89 | class WaveGlow(torch.nn.Module): 90 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 91 | n_early_size, WN_config): 92 | super(WaveGlow, self).__init__() 93 | 94 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 95 | n_mel_channels, 96 | 1024, stride=256) 97 | assert(n_group % 2 == 0) 98 | self.n_flows = n_flows 99 | self.n_group = n_group 100 | self.n_early_every = n_early_every 101 | self.n_early_size = n_early_size 102 | self.WN = torch.nn.ModuleList() 103 | self.convinv = torch.nn.ModuleList() 104 | 105 | n_half = int(n_group/2) 106 | 107 | # Set up layers with the right sizes based on how many dimensions 108 | # have been output already 109 | n_remaining_channels = n_group 110 | for k in range(n_flows): 111 | if k % 
self.n_early_every == 0 and k > 0: 112 | n_half = n_half - int(self.n_early_size/2) 113 | n_remaining_channels = n_remaining_channels - self.n_early_size 114 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 115 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 116 | self.n_remaining_channels = n_remaining_channels # Useful during inference 117 | 118 | def forward(self, forward_input): 119 | return None 120 | """ 121 | forward_input[0] = audio: batch x time 122 | forward_input[1] = upsamp_spectrogram: batch x n_cond_channels x time 123 | """ 124 | """ 125 | spect, audio = forward_input 126 | 127 | # Upsample spectrogram to size of audio 128 | spect = self.upsample(spect) 129 | assert(spect.size(2) >= audio.size(1)) 130 | if spect.size(2) > audio.size(1): 131 | spect = spect[:, :, :audio.size(1)] 132 | 133 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 134 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 135 | 136 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) 137 | output_audio = [] 138 | s_list = [] 139 | s_conv_list = [] 140 | 141 | for k in range(self.n_flows): 142 | if k%4 == 0 and k > 0: 143 | output_audio.append(audio[:,:self.n_multi,:]) 144 | audio = audio[:,self.n_multi:,:] 145 | 146 | # project to new basis 147 | audio, s = self.convinv[k](audio) 148 | s_conv_list.append(s) 149 | 150 | n_half = int(audio.size(1)/2) 151 | if k%2 == 0: 152 | audio_0 = audio[:,:n_half,:] 153 | audio_1 = audio[:,n_half:,:] 154 | else: 155 | audio_1 = audio[:,:n_half,:] 156 | audio_0 = audio[:,n_half:,:] 157 | 158 | output = self.nn[k]((audio_0, spect)) 159 | s = output[:, n_half:, :] 160 | b = output[:, :n_half, :] 161 | audio_1 = torch.exp(s)*audio_1 + b 162 | s_list.append(s) 163 | 164 | if k%2 == 0: 165 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 166 | else: 167 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 168 | output_audio.append(audio) 169 | return torch.cat(output_audio,1), s_list, s_conv_list 170 | """ 171 | 172 | def infer(self, spect, sigma=1.0): 173 | spect = self.upsample(spect) 174 | # trim conv artifacts. 
maybe pad spec to kernel multiple 175 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 176 | spect = spect[:, :, :-time_cutoff] 177 | 178 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 179 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 180 | 181 | if spect.type() == 'torch.cuda.HalfTensor': 182 | audio = torch.cuda.HalfTensor(spect.size(0), 183 | self.n_remaining_channels, 184 | spect.size(2)).normal_() 185 | else: 186 | audio = torch.cuda.FloatTensor(spect.size(0), 187 | self.n_remaining_channels, 188 | spect.size(2)).normal_() 189 | 190 | audio = torch.autograd.Variable(sigma*audio) 191 | 192 | for k in reversed(range(self.n_flows)): 193 | n_half = int(audio.size(1)/2) 194 | if k%2 == 0: 195 | audio_0 = audio[:,:n_half,:] 196 | audio_1 = audio[:,n_half:,:] 197 | else: 198 | audio_1 = audio[:,:n_half,:] 199 | audio_0 = audio[:,n_half:,:] 200 | 201 | output = self.WN[k]((audio_0, spect)) 202 | s = output[:, n_half:, :] 203 | b = output[:, :n_half, :] 204 | audio_1 = (audio_1 - b)/torch.exp(s) 205 | if k%2 == 0: 206 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 207 | else: 208 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 209 | 210 | audio = self.convinv[k](audio, reverse=True) 211 | 212 | if k%4 == 0 and k > 0: 213 | if spect.type() == 'torch.cuda.HalfTensor': 214 | z = torch.cuda.HalfTensor(spect.size(0), 215 | self.n_early_size, 216 | spect.size(2)).normal_() 217 | else: 218 | z = torch.cuda.FloatTensor(spect.size(0), 219 | self.n_early_size, 220 | spect.size(2)).normal_() 221 | audio = torch.cat((sigma*z, audio),1) 222 | 223 | return audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 224 | 225 | @staticmethod 226 | def remove_weightnorm(model): 227 | waveglow = model 228 | for WN in waveglow.WN: 229 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 230 | WN.in_layers = remove(WN.in_layers) 231 | WN.cond_layers = remove(WN.cond_layers) 232 | WN.res_skip_layers = remove(WN.res_skip_layers) 233 | return waveglow 234 | -------------------------------------------------------------------------------- /waveglow/inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import os 28 | from scipy.io.wavfile import write 29 | import torch 30 | from mel2samp import files_to_list, MAX_WAV_VALUE 31 | from denoiser import Denoiser 32 | 33 | 34 | def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16, 35 | denoiser_strength): 36 | mel_files = files_to_list(mel_files) 37 | waveglow = torch.load(waveglow_path)['model'] 38 | waveglow = waveglow.remove_weightnorm(waveglow) 39 | waveglow.cuda().eval() 40 | if is_fp16: 41 | from apex import amp 42 | waveglow, _ = amp.initialize(waveglow, [], opt_level="O3") 43 | 44 | if denoiser_strength > 0: 45 | denoiser = Denoiser(waveglow).cuda() 46 | 47 | for i, file_path in enumerate(mel_files): 48 | file_name = os.path.splitext(os.path.basename(file_path))[0] 49 | mel = torch.load(file_path) 50 | mel = torch.autograd.Variable(mel.cuda()) 51 | mel = torch.unsqueeze(mel, 0) 52 | mel = mel.half() if is_fp16 else mel 53 | with torch.no_grad(): 54 | audio = waveglow.infer(mel, sigma=sigma) 55 | if denoiser_strength > 0: 56 | audio = denoiser(audio, denoiser_strength) 57 | audio = audio * MAX_WAV_VALUE 58 | audio = audio.squeeze() 59 | audio = audio.cpu().numpy() 60 | audio = audio.astype('int16') 61 | audio_path = os.path.join( 62 | output_dir, "{}_synthesis.wav".format(file_name)) 63 | write(audio_path, sampling_rate, audio) 64 | print(audio_path) 65 | 66 | 67 | if __name__ == "__main__": 68 | import argparse 69 | 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('-f', "--filelist_path", required=True) 72 | parser.add_argument('-w', '--waveglow_path', 73 | help='Path to waveglow decoder checkpoint with model') 74 | parser.add_argument('-o', "--output_dir", required=True) 75 | parser.add_argument("-s", "--sigma", default=1.0, type=float) 76 | parser.add_argument("--sampling_rate", default=22050, type=int) 77 | parser.add_argument("--is_fp16", action="store_true") 78 | parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float, 79 | help='Removes model bias. Start with 0.1 and adjust') 80 | 81 | args = parser.parse_args() 82 | 83 | main(args.filelist_path, args.waveglow_path, args.sigma, args.output_dir, 84 | args.sampling_rate, args.is_fp16, args.denoiser_strength) 85 | -------------------------------------------------------------------------------- /waveglow/mel2samp.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # *****************************************************************************\ 27 | import os 28 | import random 29 | import argparse 30 | import json 31 | import torch 32 | import torch.utils.data 33 | import sys 34 | from scipy.io.wavfile import read 35 | 36 | # We're using the audio processing from TacoTron2 to make sure it matches 37 | sys.path.insert(0, 'tacotron2') 38 | from tacotron2.layers import TacotronSTFT 39 | 40 | MAX_WAV_VALUE = 32768.0 41 | 42 | def files_to_list(filename): 43 | """ 44 | Takes a text file of filenames and makes a list of filenames 45 | """ 46 | with open(filename, encoding='utf-8') as f: 47 | files = f.readlines() 48 | 49 | files = [f.rstrip() for f in files] 50 | return files 51 | 52 | def load_wav_to_torch(full_path): 53 | """ 54 | Loads wavdata into torch array 55 | """ 56 | sampling_rate, data = read(full_path) 57 | return torch.from_numpy(data).float(), sampling_rate 58 | 59 | 60 | class Mel2Samp(torch.utils.data.Dataset): 61 | """ 62 | This is the main class that calculates the spectrogram and returns the 63 | spectrogram, audio pair. 
64 | """ 65 | def __init__(self, training_files, segment_length, filter_length, 66 | hop_length, win_length, sampling_rate, mel_fmin, mel_fmax): 67 | self.audio_files = files_to_list(training_files) 68 | random.seed(1234) 69 | random.shuffle(self.audio_files) 70 | self.stft = TacotronSTFT(filter_length=filter_length, 71 | hop_length=hop_length, 72 | win_length=win_length, 73 | sampling_rate=sampling_rate, 74 | mel_fmin=mel_fmin, mel_fmax=mel_fmax) 75 | self.segment_length = segment_length 76 | self.sampling_rate = sampling_rate 77 | 78 | def get_mel(self, audio): 79 | audio_norm = audio / MAX_WAV_VALUE 80 | audio_norm = audio_norm.unsqueeze(0) 81 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 82 | melspec = self.stft.mel_spectrogram(audio_norm) 83 | melspec = torch.squeeze(melspec, 0) 84 | return melspec 85 | 86 | def __getitem__(self, index): 87 | # Read audio 88 | filename = self.audio_files[index] 89 | audio, sampling_rate = load_wav_to_torch(filename) 90 | if sampling_rate != self.sampling_rate: 91 | raise ValueError("{} SR doesn't match target {} SR".format( 92 | sampling_rate, self.sampling_rate)) 93 | 94 | # Take segment 95 | if audio.size(0) >= self.segment_length: 96 | max_audio_start = audio.size(0) - self.segment_length 97 | audio_start = random.randint(0, max_audio_start) 98 | audio = audio[audio_start:audio_start+self.segment_length] 99 | else: 100 | audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data 101 | 102 | mel = self.get_mel(audio) 103 | audio = audio / MAX_WAV_VALUE 104 | 105 | return (mel, audio) 106 | 107 | def __len__(self): 108 | return len(self.audio_files) 109 | 110 | # =================================================================== 111 | # Takes directory of clean audio and makes directory of spectrograms 112 | # Useful for making test sets 113 | # =================================================================== 114 | if __name__ == "__main__": 115 | # Get defaults so it can work with no Sacred 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('-f', "--filelist_path", required=True) 118 | parser.add_argument('-c', '--config', type=str, 119 | help='JSON file for configuration') 120 | parser.add_argument('-o', '--output_dir', type=str, 121 | help='Output directory') 122 | args = parser.parse_args() 123 | 124 | with open(args.config) as f: 125 | data = f.read() 126 | data_config = json.loads(data)["data_config"] 127 | mel2samp = Mel2Samp(**data_config) 128 | 129 | filepaths = files_to_list(args.filelist_path) 130 | 131 | # Make directory if it doesn't exist 132 | if not os.path.isdir(args.output_dir): 133 | os.makedirs(args.output_dir) 134 | os.chmod(args.output_dir, 0o775) 135 | 136 | for filepath in filepaths: 137 | audio, sr = load_wav_to_torch(filepath) 138 | melspectrogram = mel2samp.get_mel(audio) 139 | filename = os.path.basename(filepath) 140 | new_filepath = args.output_dir + '/' + filename + '.pt' 141 | print(new_filepath) 142 | torch.save(melspectrogram, new_filepath) 143 | -------------------------------------------------------------------------------- /waveglow/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.0 2 | matplotlib==2.1.0 3 | tensorflow 4 | numpy==1.22.0 5 | inflect==0.2.5 6 | librosa==0.6.0 7 | scipy==1.0.0 8 | tensorboardX==1.1 9 | Unidecode==1.0.22 10 | pillow 11 | -------------------------------------------------------------------------------- /waveglow/train.py: 
-------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
# *****************************************************************************
import argparse
import json
import os
import torch

#=====START: ADDED FOR DISTRIBUTED======
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
from torch.utils.data.distributed import DistributedSampler
#=====END: ADDED FOR DISTRIBUTED======

from torch.utils.data import DataLoader
from glow import WaveGlow, WaveGlowLoss
from mel2samp import Mel2Samp


def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    print("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)


def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
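    # NOTE: `iteration` counts optimizer steps across epochs, so checkpoints below
    # are step-indexed; `epoch_offset` resumes at the epoch in which a loaded
    # checkpoint was saved.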
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-r', '--rank', type=int, default=0,
                        help='rank of process for distributed')
    parser.add_argument('-g', '--group_name', type=str, default='',
                        help='name of group for distributed')
    args = parser.parse_args()

    # Parse configs. Globals nicer in this case
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    train_config = config["train_config"]
    global data_config
    data_config = config["data_config"]
    global dist_config
    dist_config = config["dist_config"]
    global waveglow_config
    waveglow_config = config["waveglow_config"]

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        if args.group_name == '':
            print("WARNING: Multiple GPUs detected but no distributed group set")
            print("Only running 1 GPU. Use distributed.py for multiple GPUs")
            num_gpus = 1

    if num_gpus == 1 and args.rank != 0:
        raise Exception("Doing single GPU training on rank > 0")

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    train(num_gpus, args.rank, args.group_name, **train_config)
--------------------------------------------------------------------------------
/waveglow/waveglow_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LEEYOONHYUNG/BVAE-TTS/1851f17a61a57daf2ca42f5474352162c1e7fcb2/waveglow/waveglow_logo.png
--------------------------------------------------------------------------------
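For reference, `waveglow/train.py` reads its settings from the JSON file passed via `-c`. Below is a minimal sketch of that layout, written as a Python dict for readability. The four section names and their keys mirror what `__main__` and the function signatures above consume; every value shown is an illustrative assumption, not the repository's shipped `config.json`:

```python
config = {
    "train_config": {              # forwarded to train(**train_config)
        "output_directory": "checkpoints",
        "epochs": 100000,
        "learning_rate": 1e-4,
        "sigma": 1.0,
        "iters_per_checkpoint": 2000,
        "batch_size": 12,
        "seed": 1234,
        "fp16_run": False,
        "checkpoint_path": "",     # "" trains from scratch
        "with_tensorboard": False,
    },
    "data_config": {               # forwarded to Mel2Samp(**data_config)
        "training_files": "train_files.txt",
        "segment_length": 16000,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "sampling_rate": 22050,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
    },
    "dist_config": {},             # forwarded to init_distributed() when num_gpus > 1
    "waveglow_config": {},         # forwarded to WaveGlow(**waveglow_config)
}
```

With such a file in place, single-GPU training is launched as `python train.py -c config.json`.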