├── .gitignore ├── LICENSE ├── LJSpeech-1.1 ├── training.txt └── validation.txt ├── README.md ├── config_c32.json ├── discriminator.py ├── generator.py ├── loss.py ├── meldataset.py ├── modules.py ├── stft_loss.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Rishikesh (ऋषिकेश) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LJSpeech-1.1/validation.txt: -------------------------------------------------------------------------------- 1 | LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order. 2 | LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security. 3 | LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex 4 | LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security. 
5 | LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination 6 | LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted, 7 | LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger. 8 | LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that, 9 | LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President, 10 | LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties. 11 | LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities. 
12 | LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic. 13 | LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo. 14 | LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence. 15 | LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny. 16 | LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds, 17 | LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform. 18 | LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness. 19 | LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating. 20 | LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied. 
21 | LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison, 22 | LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail. 23 | LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer. 24 | LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners. 25 | LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons. 26 | LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell. 27 | LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition, 28 | LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner. 
29 | LJ007-0217|They go on to say|They go on to say 30 | LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas, 31 | LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor. 32 | LJ008-0131|the other he kept between his hands.|the other he kept between his hands. 33 | LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity, 34 | LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion. 35 | LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death. 36 | LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate. 
37 | LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt. 38 | LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields. 39 | LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive, 40 | LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case, 41 | LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal, 42 | LJ010-0262|charged him with the offense.|charged him with the offense. 43 | LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen. 44 | LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person, 45 | LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell. 46 | LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray. 
47 | LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married, 48 | LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft. 49 | LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew. 50 | LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done. 51 | LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for. 52 | LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 53 | LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone. 54 | LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard. 
55 | LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe. 56 | LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 57 | LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds. 58 | LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud. 59 | LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force. 60 | LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery. 61 | LJ016-0407|who generally attended the prison services.|who generally attended the prison services. 62 | LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched. 63 | LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill. 
64 | LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling 65 | LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain. 66 | LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted. 67 | LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan. 68 | LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet. 69 | LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one. 70 | LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act. 71 | LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works. 72 | LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large. 
73 | LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport, 74 | LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution. 75 | LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay 76 | LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress? 77 | LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists. 78 | LJ026-0029|so in the case under discussion.|so in the case under discussion. 79 | LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent. 80 | LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food. 81 | LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells. 82 | LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere. 
83 | LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity. 84 | LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise: 85 | LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be, 86 | LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world. 87 | LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken, 88 | LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater. 89 | LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen, 90 | LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital. 
91 | LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries. 92 | LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital, 93 | LJ032-0100|Marina Oswald|Marina Oswald 94 | LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers. 95 | LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three, 96 | LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so. 97 | LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so. 
98 | LJ033-0135|Location of Bag|Location of Bag 99 | LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard 100 | LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote, 101 | LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building, 102 | LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly. 103 | LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade, 104 | LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor 105 | LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car 106 | LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots. 107 | LJ037-0157|taken from Oswald.|taken from Oswald. 
108 | LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting, 109 | LJ037-0219|Oswald's Jacket|Oswald's Jacket 110 | LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket. 111 | LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb. 112 | LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight. 113 | LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention. 114 | LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm. 115 | LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear. 
116 | LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards, 117 | LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle, 118 | LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years, 119 | LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave. 120 | LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic. 121 | LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone 122 | LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote. 123 | LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society. 
124 | LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide. 125 | LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR. 126 | LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle. 127 | LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination. 128 | LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required 129 | LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker. 130 | LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact 131 | LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities. 
132 | LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission, 133 | LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote, 134 | LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong. 135 | LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks. 136 | LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it. 137 | LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries, 138 | LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States. 139 | LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two 140 | LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination. 
141 | LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart, 142 | LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators. 143 | LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote. 144 | LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two, 145 | LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger, 146 | LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated 147 | LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital. 
148 | LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service, 149 | LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete; 150 | LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described. 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UnivNet 2 | [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation.](https://arxiv.org/abs/2106.07889) 3 | 4 | ## Training 5 | ``` 6 | python train.py --config config_c32.json 7 | ``` 8 | ## Citation 9 | ``` 10 | @misc{jang2021univnet, 11 | title={UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation}, 12 | author={Won Jang and Dan Lim and Jaesam Yoon and Bongwan Kim and Juntae Kim}, 13 | year={2021}, 14 | eprint={2106.07889}, 15 | archivePrefix={arXiv}, 16 | primaryClass={eess.AS} 17 | } 18 | ``` 19 | 20 | ## Note 21 | * For more complete and end to end Voice cloning or Text to Speech (TTS) toolbox 🧰 please visit [Deepsync Technologies](https://deepsync.co/).
22 | 23 | ## References: 24 | * [Hi-Fi-GAN repo](https://github.com/jik876/hifi-gan) 25 | * [LVCNet repo](https://github.com/ZENGZHEN-TTS/LVCNet) 26 | -------------------------------------------------------------------------------- /config_c32.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 0, 3 | "batch_size": 16, 4 | "learning_rate": 0.0002, 5 | "adam_b1": 0.8, 6 | "adam_b2": 0.99, 7 | "lr_decay": 0.999, 8 | "seed": 1234, 9 | "disc_start_step": 400000, 10 | "lambda_aux": 2.5, 11 | 12 | "upsample_rates": [8,8,4], 13 | "cond_in_channels": 64, 14 | "out_channels": 1, 15 | "cg_channels": 32, 16 | "num_lvc_blocks": 4, 17 | "lvc_kernels": 3, 18 | "lvc_hidden_channels": 64, 19 | "lvc_conv_size": 3, 20 | "dropout": 0.0, 21 | 22 | 23 | "segment_size": 8192, 24 | "num_mels": 80, 25 | "num_freq": 1025, 26 | "n_fft": 1024, 27 | "hop_size": 256, 28 | "win_size": 1024, 29 | 30 | "sampling_rate": 22050, 31 | 32 | "fmin": 0, 33 | "fmax": 8000, 34 | "fmax_for_loss": null, 35 | 36 | "num_workers": 4, 37 | 38 | "dist_config": { 39 | "dist_backend": "nccl", 40 | "dist_url": "tcp://localhost:54321", 41 | "world_size": 1 42 | } 43 | } -------------------------------------------------------------------------------- /discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.nn import Conv1d, AvgPool1d, Conv2d 5 | from torch.nn.utils import weight_norm, spectral_norm 6 | from utils import get_padding 7 | from stft_loss import stft 8 | 9 | LRELU_SLOPE = 0.1 10 | 11 | 12 | 13 | class SpecDiscriminator(nn.Module): 14 | """docstring for Discriminator.""" 15 | 16 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False): 17 | super(SpecDiscriminator, self).__init__() 18 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 19 
| self.fft_size = fft_size 20 | self.shift_size = shift_size 21 | self.win_length = win_length 22 | self.window = getattr(torch, window)(win_length) 23 | self.discriminators = nn.ModuleList([ 24 | norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))), 25 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))), 26 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))), 27 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))), 28 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1,1), padding=(1, 1))), 29 | ]) 30 | 31 | self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1)) 32 | 33 | def forward(self, y): 34 | 35 | fmap = [] 36 | with torch.no_grad(): 37 | y = y.squeeze(1) 38 | y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.get_device())) 39 | y = y.unsqueeze(1) 40 | for i, d in enumerate(self.discriminators): 41 | y = d(y) 42 | y = F.leaky_relu(y, LRELU_SLOPE) 43 | fmap.append(y) 44 | 45 | y = self.out(y) 46 | fmap.append(y) 47 | 48 | return torch.flatten(y, 1, -1), fmap 49 | 50 | class MultiResSpecDiscriminator(torch.nn.Module): 51 | 52 | def __init__(self, 53 | fft_sizes=[1024, 2048, 512], 54 | hop_sizes=[120, 240, 50], 55 | win_lengths=[600, 1200, 240], 56 | window="hann_window"): 57 | 58 | super(MultiResSpecDiscriminator, self).__init__() 59 | self.discriminators = nn.ModuleList([ 60 | SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window), 61 | SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window), 62 | SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window) 63 | ]) 64 | 65 | def forward(self, y, y_hat): 66 | y_d_rs = [] 67 | y_d_gs = [] 68 | fmap_rs = [] 69 | fmap_gs = [] 70 | for i, d in enumerate(self.discriminators): 71 | y_d_r, fmap_r = d(y) 72 | y_d_g, fmap_g = d(y_hat) 73 | y_d_rs.append(y_d_r) 74 | fmap_rs.append(fmap_r) 75 | y_d_gs.append(y_d_g) 76 | fmap_gs.append(fmap_g) 77 | 78 | return y_d_rs, y_d_gs, 
fmap_rs, fmap_gs 79 | 80 | 81 | class DiscriminatorP(torch.nn.Module): 82 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 83 | super(DiscriminatorP, self).__init__() 84 | self.period = period 85 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 86 | self.convs = nn.ModuleList([ 87 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 88 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 89 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 90 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 91 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 92 | ]) 93 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 94 | 95 | def forward(self, x): 96 | fmap = [] 97 | 98 | # 1d to 2d 99 | b, c, t = x.shape 100 | if t % self.period != 0: # pad first 101 | n_pad = self.period - (t % self.period) 102 | x = F.pad(x, (0, n_pad), "reflect") 103 | t = t + n_pad 104 | x = x.view(b, c, t // self.period, self.period) 105 | 106 | for l in self.convs: 107 | x = l(x) 108 | x = F.leaky_relu(x, LRELU_SLOPE) 109 | fmap.append(x) 110 | x = self.conv_post(x) 111 | fmap.append(x) 112 | x = torch.flatten(x, 1, -1) 113 | 114 | return x, fmap 115 | 116 | 117 | class MultiPeriodDiscriminator(torch.nn.Module): 118 | def __init__(self): 119 | super(MultiPeriodDiscriminator, self).__init__() 120 | self.discriminators = nn.ModuleList([ 121 | DiscriminatorP(2), 122 | DiscriminatorP(3), 123 | DiscriminatorP(5), 124 | DiscriminatorP(7), 125 | DiscriminatorP(11), 126 | ]) 127 | 128 | def forward(self, y, y_hat): 129 | y_d_rs = [] 130 | y_d_gs = [] 131 | fmap_rs = [] 132 | fmap_gs = [] 133 | for i, d in enumerate(self.discriminators): 134 | y_d_r, fmap_r = d(y) 135 | y_d_g, fmap_g = d(y_hat) 136 | y_d_rs.append(y_d_r) 137 | fmap_rs.append(fmap_r) 138 | 
y_d_gs.append(y_d_g) 139 | fmap_gs.append(fmap_g) 140 | 141 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 142 | 143 | 144 | class DiscriminatorS(torch.nn.Module): 145 | def __init__(self, use_spectral_norm=False): 146 | super(DiscriminatorS, self).__init__() 147 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 148 | self.convs = nn.ModuleList([ 149 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 150 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 151 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 152 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 153 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 154 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 155 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 156 | ]) 157 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 158 | 159 | def forward(self, x): 160 | fmap = [] 161 | for l in self.convs: 162 | x = l(x) 163 | x = F.leaky_relu(x, LRELU_SLOPE) 164 | fmap.append(x) 165 | x = self.conv_post(x) 166 | fmap.append(x) 167 | x = torch.flatten(x, 1, -1) 168 | 169 | return x, fmap 170 | 171 | 172 | class MultiScaleDiscriminator(torch.nn.Module): 173 | def __init__(self): 174 | super(MultiScaleDiscriminator, self).__init__() 175 | self.discriminators = nn.ModuleList([ 176 | DiscriminatorS(use_spectral_norm=True), 177 | DiscriminatorS(), 178 | DiscriminatorS(), 179 | ]) 180 | self.meanpools = nn.ModuleList([ 181 | AvgPool1d(4, 2, padding=2), 182 | AvgPool1d(4, 2, padding=2) 183 | ]) 184 | 185 | def forward(self, y, y_hat): 186 | y_d_rs = [] 187 | y_d_gs = [] 188 | fmap_rs = [] 189 | fmap_gs = [] 190 | for i, d in enumerate(self.discriminators): 191 | if i != 0: 192 | y = self.meanpools[i-1](y) 193 | y_hat = self.meanpools[i-1](y_hat) 194 | y_d_r, fmap_r = d(y) 195 | y_d_g, fmap_g = d(y_hat) 196 | y_d_rs.append(y_d_r) 197 | fmap_rs.append(fmap_r) 198 | y_d_gs.append(y_d_g) 199 | fmap_gs.append(fmap_g) 200 | 201 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 
-------------------------------------------------------------------------------- /generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | from modules import LVCBlock 4 | import torch.nn.functional as F 5 | 6 | LRELU_SLOPE = 0.1 7 | 8 | class UnivNet(torch.nn.Module): 9 | """Parallel WaveGAN Generator module.""" 10 | 11 | def __init__(self, h, use_weight_norm=True): 12 | 13 | super().__init__() 14 | in_channels = h.cond_in_channels 15 | out_channels = h.out_channels 16 | inner_channels = h.cg_channels 17 | cond_channels = h.num_mels 18 | upsample_ratios = h.upsample_rates 19 | lvc_layers_each_block = h.num_lvc_blocks 20 | lvc_kernel_size = h.lvc_kernels 21 | kpnet_hidden_channels = h.lvc_hidden_channels 22 | kpnet_conv_size = h.lvc_conv_size 23 | dropout = h.dropout 24 | 25 | 26 | self.in_channels = in_channels 27 | self.out_channels = out_channels 28 | self.cond_channels = cond_channels 29 | self.lvc_block_nums = len(upsample_ratios) 30 | 31 | # define first convolution 32 | self.first_conv = torch.nn.Conv1d(in_channels, inner_channels, 33 | kernel_size=7, padding=(7 - 1) // 2, 34 | dilation=1, bias=True) 35 | 36 | # define residual blocks 37 | self.lvc_blocks = torch.nn.ModuleList() 38 | cond_hop_length = 1 39 | for n in range(self.lvc_block_nums): 40 | cond_hop_length = cond_hop_length * upsample_ratios[n] 41 | lvcb = LVCBlock( 42 | in_channels=inner_channels, 43 | cond_channels=cond_channels, 44 | upsample_ratio=upsample_ratios[n], 45 | conv_layers=lvc_layers_each_block, 46 | conv_kernel_size=lvc_kernel_size, 47 | cond_hop_length=cond_hop_length, 48 | kpnet_hidden_channels=kpnet_hidden_channels, 49 | kpnet_conv_size=kpnet_conv_size, 50 | kpnet_dropout=dropout, 51 | ) 52 | self.lvc_blocks += [lvcb] 53 | 54 | # define output layers 55 | self.last_conv_layers = torch.nn.ModuleList([ 56 | torch.nn.Conv1d(inner_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2, 57 | dilation=1, 
bias=True), 58 | 59 | ]) 60 | 61 | # apply weight norm 62 | if use_weight_norm: 63 | self.apply_weight_norm() 64 | 65 | def forward(self, x, c): 66 | """Calculate forward propagation. 67 | Args: 68 | x (Tensor): Input noise signal (B, 1, T). 69 | c (Tensor): Local conditioning auxiliary features (B, C ,T'). 70 | Returns: 71 | Tensor: Output tensor (B, out_channels, T) 72 | """ 73 | 74 | x = self.first_conv(x) 75 | 76 | for n in range(self.lvc_block_nums): 77 | x = self.lvc_blocks[n](x, c) 78 | 79 | # apply final layers 80 | for f in self.last_conv_layers: 81 | x = F.leaky_relu(x, LRELU_SLOPE) 82 | x = f(x) 83 | x = torch.tanh(x) 84 | return x 85 | 86 | def remove_weight_norm(self): 87 | """Remove weight normalization module from all of the layers.""" 88 | def _remove_weight_norm(m): 89 | try: 90 | logging.debug(f"Weight norm is removed from {m}.") 91 | torch.nn.utils.remove_weight_norm(m) 92 | except ValueError: # this module didn't have weight norm 93 | return 94 | 95 | self.apply(_remove_weight_norm) 96 | 97 | def apply_weight_norm(self): 98 | """Apply weight normalization module from all of the layers.""" 99 | def _apply_weight_norm(m): 100 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 101 | torch.nn.utils.weight_norm(m) 102 | logging.debug(f"Weight norm is applied to {m}.") 103 | 104 | self.apply(_apply_weight_norm) 105 | 106 | @staticmethod 107 | def _get_receptive_field_size(layers, stacks, kernel_size, 108 | dilation=lambda x: 2 ** x): 109 | assert layers % stacks == 0 110 | layers_per_cycle = layers // stacks 111 | dilations = [dilation(i % layers_per_cycle) for i in range(layers)] 112 | return (kernel_size - 1) * sum(dilations) + 1 113 | 114 | @property 115 | def receptive_field_size(self): 116 | """Return receptive field size.""" 117 | return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) 118 | 119 | def inference(self, c=None, x=None): 120 | """Perform inference. 
121 | Args: 122 | c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). 123 | x (Union[Tensor, ndarray]): Input noise signal (T, 1). 124 | Returns: 125 | Tensor: Output tensor (T, out_channels) 126 | """ 127 | if x is not None: 128 | if not isinstance(x, torch.Tensor): 129 | x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) 130 | x = x.transpose(1, 0).unsqueeze(0) 131 | else: 132 | assert c is not None 133 | x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) 134 | if c is not None: 135 | if not isinstance(c, torch.Tensor): 136 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 137 | c = c.transpose(1, 0).unsqueeze(0) 138 | c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) 139 | return self.forward(x, c).squeeze(0).transpose(1, 0) 140 | 141 | 142 | ''' 143 | to run this, fix 144 | from . import ResStack 145 | into 146 | from res_stack import ResStack 147 | ''' 148 | if __name__ == '__main__': 149 | ''' 150 | torch.Size([3, 80, 10]) 151 | torch.Size([3, 1, 2000]) 152 | 4527362 153 | ''' 154 | model = UnivNet() 155 | 156 | x = torch.randn(3, 64, 10) 157 | c = torch.randn(3, 80, 10) # (B, channels, T). 
158 | print(c.shape) 159 | 160 | y = model(x, c) # (B, 1, T ** prod(upsample_scales) 161 | print(y.shape) 162 | assert y.shape == torch.Size([3, 1, 2560]) # For normal melgan torch.Size([3, 1, 2560]) 163 | 164 | pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 165 | print(pytorch_total_params) -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | loss += torch.mean(torch.abs(rl - gl)) 9 | 10 | return loss*2 11 | 12 | 13 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 14 | loss = 0 15 | r_losses = [] 16 | g_losses = [] 17 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 18 | r_loss = torch.mean((1-dr)**2) 19 | g_loss = torch.mean(dg**2) 20 | loss += (r_loss + g_loss) 21 | r_losses.append(r_loss.item()) 22 | g_losses.append(g_loss.item()) 23 | 24 | return loss, r_losses, g_losses 25 | 26 | 27 | def generator_loss(disc_outputs): 28 | loss = 0 29 | gen_losses = [] 30 | for dg in disc_outputs: 31 | l = torch.mean((1-dg)**2) 32 | gen_losses.append(l) 33 | loss += l 34 | 35 | return loss, gen_losses -------------------------------------------------------------------------------- /meldataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | import torch.utils.data 6 | import numpy as np 7 | from librosa.util import normalize 8 | from scipy.io.wavfile import read 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | MAX_WAV_VALUE = 32768.0 12 | 13 | 14 | def load_wav(full_path): 15 | sampling_rate, data = read(full_path) 16 | return data, sampling_rate 17 | 18 | 19 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 20 | 
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 21 | 22 | 23 | def dynamic_range_decompression(x, C=1): 24 | return np.exp(x) / C 25 | 26 | 27 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 28 | return torch.log(torch.clamp(x, min=clip_val) * C) 29 | 30 | 31 | def dynamic_range_decompression_torch(x, C=1): 32 | return torch.exp(x) / C 33 | 34 | 35 | def spectral_normalize_torch(magnitudes): 36 | output = dynamic_range_compression_torch(magnitudes) 37 | return output 38 | 39 | 40 | def spectral_de_normalize_torch(magnitudes): 41 | output = dynamic_range_decompression_torch(magnitudes) 42 | return output 43 | 44 | 45 | mel_basis = {} 46 | hann_window = {} 47 | 48 | 49 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 50 | if torch.min(y) < -1.: 51 | print('min value is ', torch.min(y)) 52 | if torch.max(y) > 1.: 53 | print('max value is ', torch.max(y)) 54 | 55 | global mel_basis, hann_window 56 | if fmax not in mel_basis: 57 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 58 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 59 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 60 | 61 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 62 | y = y.squeeze(1) 63 | 64 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 65 | center=center, pad_mode='reflect', normalized=False, onesided=True) 66 | 67 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 68 | 69 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 70 | spec = spectral_normalize_torch(spec) 71 | 72 | return spec 73 | 74 | 75 | def get_dataset_filelist(a): 76 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 77 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 78 | for x in 
fi.read().split('\n') if len(x) > 0] 79 | 80 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 81 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 82 | for x in fi.read().split('\n') if len(x) > 0] 83 | return training_files, validation_files 84 | 85 | 86 | class MelDataset(torch.utils.data.Dataset): 87 | def __init__(self, training_files, segment_size, n_fft, num_mels, 88 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 89 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 90 | self.audio_files = training_files 91 | random.seed(1234) 92 | if shuffle: 93 | random.shuffle(self.audio_files) 94 | self.segment_size = segment_size 95 | self.sampling_rate = sampling_rate 96 | self.split = split 97 | self.n_fft = n_fft 98 | self.num_mels = num_mels 99 | self.hop_size = hop_size 100 | self.win_size = win_size 101 | self.fmin = fmin 102 | self.fmax = fmax 103 | self.fmax_loss = fmax_loss 104 | self.cached_wav = None 105 | self.n_cache_reuse = n_cache_reuse 106 | self._cache_ref_count = 0 107 | self.device = device 108 | self.fine_tuning = fine_tuning 109 | self.base_mels_path = base_mels_path 110 | 111 | def __getitem__(self, index): 112 | filename = self.audio_files[index] 113 | if self._cache_ref_count == 0: 114 | audio, sampling_rate = load_wav(filename) 115 | audio = audio / MAX_WAV_VALUE 116 | if not self.fine_tuning: 117 | audio = normalize(audio) * 0.95 118 | self.cached_wav = audio 119 | if sampling_rate != self.sampling_rate: 120 | raise ValueError("{} SR doesn't match target {} SR".format( 121 | sampling_rate, self.sampling_rate)) 122 | self._cache_ref_count = self.n_cache_reuse 123 | else: 124 | audio = self.cached_wav 125 | self._cache_ref_count -= 1 126 | 127 | audio = torch.FloatTensor(audio) 128 | audio = audio.unsqueeze(0) 129 | 130 | if not self.fine_tuning: 131 | if self.split: 132 | if audio.size(1) >= self.segment_size: 133 | max_audio_start = 
audio.size(1) - self.segment_size 134 | audio_start = random.randint(0, max_audio_start) 135 | audio = audio[:, audio_start:audio_start+self.segment_size] 136 | else: 137 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 138 | 139 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 140 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 141 | center=False) 142 | else: 143 | mel = np.load( 144 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 145 | mel = torch.from_numpy(mel) 146 | 147 | if len(mel.shape) < 3: 148 | mel = mel.unsqueeze(0) 149 | 150 | if self.split: 151 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 152 | 153 | if audio.size(1) >= self.segment_size: 154 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 155 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 156 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 157 | else: 158 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 159 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 160 | 161 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 162 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 163 | center=False) 164 | noise = torch.randn([64, mel.shape[-1]]) 165 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze(), noise) 166 | 167 | def __len__(self): 168 | return len(self.audio_files) -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | class KernelPredictor(torch.nn.Module): 5 | ''' Kernel predictor for the location-variable convolutions 6 | ''' 7 | 8 | def __init__(self, 9 | cond_channels, 10 
| conv_in_channels, 11 | conv_out_channels, 12 | conv_layers, 13 | conv_kernel_size=3, 14 | kpnet_hidden_channels=64, 15 | kpnet_conv_size=3, 16 | kpnet_dropout=0.0, 17 | kpnet_nonlinear_activation="LeakyReLU", 18 | kpnet_nonlinear_activation_params={"negative_slope": 0.1} 19 | ): 20 | ''' 21 | Args: 22 | cond_channels (int): number of channel for the conditioning sequence, 23 | conv_in_channels (int): number of channel for the input sequence, 24 | conv_out_channels (int): number of channel for the output sequence, 25 | conv_layers (int): 26 | kpnet_ 27 | ''' 28 | super().__init__() 29 | 30 | self.conv_in_channels = conv_in_channels 31 | self.conv_out_channels = conv_out_channels 32 | self.conv_kernel_size = conv_kernel_size 33 | self.conv_layers = conv_layers 34 | 35 | l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers 36 | l_b = conv_out_channels * conv_layers 37 | 38 | padding = (kpnet_conv_size - 1) // 2 39 | self.input_conv = torch.nn.Sequential( 40 | torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True), 41 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 42 | ) 43 | 44 | self.residual_conv = torch.nn.Sequential( 45 | torch.nn.Dropout(kpnet_dropout), 46 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 47 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 48 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 49 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 50 | torch.nn.Dropout(kpnet_dropout), 51 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 52 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 53 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, 
kpnet_conv_size, padding=padding, bias=True), 54 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 55 | torch.nn.Dropout(kpnet_dropout), 56 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 57 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 58 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 59 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 60 | ) 61 | 62 | self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size, 63 | padding=padding, bias=True) 64 | self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size, padding=padding, 65 | bias=True) 66 | 67 | def forward(self, c): 68 | ''' 69 | Args: 70 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 71 | Returns: 72 | ''' 73 | batch, cond_channels, cond_length = c.shape 74 | 75 | c = self.input_conv(c) 76 | c = c + self.residual_conv(c) 77 | k = self.kernel_conv(c) 78 | b = self.bias_conv(c) 79 | 80 | kernels = k.contiguous().view(batch, 81 | self.conv_layers, 82 | self.conv_in_channels, 83 | self.conv_out_channels, 84 | self.conv_kernel_size, 85 | cond_length) 86 | bias = b.contiguous().view(batch, 87 | self.conv_layers, 88 | self.conv_out_channels, 89 | cond_length) 90 | return kernels, bias 91 | 92 | 93 | class LVCBlock(torch.nn.Module): 94 | ''' the location-variable convolutions 95 | ''' 96 | 97 | def __init__(self, 98 | in_channels, 99 | cond_channels, 100 | upsample_ratio, 101 | conv_layers=4, 102 | conv_kernel_size=3, 103 | cond_hop_length=256, 104 | kpnet_hidden_channels=64, 105 | kpnet_conv_size=3, 106 | kpnet_dropout=0.0 107 | ): 108 | super().__init__() 109 | 110 | self.cond_hop_length = cond_hop_length 111 | self.conv_layers = conv_layers 112 | self.conv_kernel_size = conv_kernel_size 113 | self.convs = 
torch.nn.ModuleList() 114 | 115 | self.upsample = torch.nn.ConvTranspose1d(in_channels, in_channels, 116 | kernel_size=upsample_ratio*2, stride=upsample_ratio, 117 | padding=upsample_ratio // 2 + upsample_ratio % 2, 118 | output_padding=upsample_ratio % 2) 119 | 120 | 121 | self.kernel_predictor = KernelPredictor( 122 | cond_channels=cond_channels, 123 | conv_in_channels=in_channels, 124 | conv_out_channels=2 * in_channels, 125 | conv_layers=conv_layers, 126 | conv_kernel_size=conv_kernel_size, 127 | kpnet_hidden_channels=kpnet_hidden_channels, 128 | kpnet_conv_size=kpnet_conv_size, 129 | kpnet_dropout=kpnet_dropout 130 | ) 131 | 132 | 133 | for i in range(conv_layers): 134 | padding = (3 ** i) * int((conv_kernel_size - 1) / 2) 135 | conv = torch.nn.Conv1d(in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i) 136 | 137 | self.convs.append(conv) 138 | 139 | 140 | def forward(self, x, c): 141 | ''' forward propagation of the location-variable convolutions. 
142 | Args: 143 | x (Tensor): the input sequence (batch, in_channels, in_length) 144 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 145 | 146 | Returns: 147 | Tensor: the output sequence (batch, in_channels, in_length) 148 | ''' 149 | batch, in_channels, in_length = x.shape 150 | 151 | 152 | kernels, bias = self.kernel_predictor(c) 153 | 154 | x = F.leaky_relu(x, 0.2) 155 | x = self.upsample(x) 156 | 157 | for i in range(self.conv_layers): 158 | y = F.leaky_relu(x, 0.2) 159 | y = self.convs[i](y) 160 | y = F.leaky_relu(y, 0.2) 161 | 162 | k = kernels[:, i, :, :, :, :] 163 | b = bias[:, i, :, :] 164 | y = self.location_variable_convolution(y, k, b, 1, self.cond_hop_length) 165 | x = x + torch.sigmoid(y[:, :in_channels, :]) * torch.tanh(y[:, in_channels:, :]) 166 | return x 167 | 168 | def location_variable_convolution(self, x, kernel, bias, dilation, hop_size): 169 | ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. 170 | Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. 171 | Args: 172 | x (Tensor): the input sequence (batch, in_channels, in_length). 173 | kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) 174 | bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) 175 | dilation (int): the dilation of convolution. 176 | hop_size (int): the hop_size of the conditioning sequence. 177 | Returns: 178 | (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
179 | ''' 180 | batch, in_channels, in_length = x.shape 181 | batch, in_channels, out_channels, kernel_size, kernel_length = kernel.shape 182 | 183 | 184 | assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" 185 | 186 | padding = dilation * int((kernel_size - 1) / 2) 187 | x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding) 188 | x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) 189 | 190 | if hop_size < dilation: 191 | x = F.pad(x, (0, dilation), 'constant', 0) 192 | x = x.unfold(3, dilation, 193 | dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) 194 | x = x[:, :, :, :, :hop_size] 195 | x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) 196 | x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) 197 | 198 | o = torch.einsum('bildsk,biokl->bolsd', x, kernel) 199 | o = o + bias.unsqueeze(-1).unsqueeze(-1) 200 | o = o.contiguous().view(batch, out_channels, -1) 201 | return o 202 | -------------------------------------------------------------------------------- /stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | Args: 15 | x (Tensor): Input signal tensor (B, T). 16 | fft_size (int): FFT size. 17 | hop_size (int): Hop size. 18 | win_length (int): Window length. 19 | window (str): Window function type. 20 | Returns: 21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 
class SpectralConvergengeLoss(torch.nn.Module):
    """Spectral convergence loss.

    Measures the Frobenius-norm distance between a predicted and a
    reference magnitude spectrogram, normalised by the reference norm.
    """

    def __init__(self):
        """Construct the (stateless) spectral convergence loss module."""
        super(SpectralConvergengeLoss, self).__init__()

    def forward(self, x_mag, y_mag):
        """Compute ``||y_mag - x_mag||_F / ||y_mag||_F``.

        Args:
            x_mag (Tensor): magnitude spectrogram of the predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): magnitude spectrogram of the groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: scalar spectral convergence loss value.
        """
        residual = torch.norm(y_mag - x_mag, p="fro")
        reference = torch.norm(y_mag, p="fro")
        return residual / reference
class STFTLoss(torch.nn.Module):
    """Single-resolution STFT loss (spectral convergence + log STFT magnitude)."""

    def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
        """Initialize STFT loss module.

        Args:
            fft_size (int): FFT size.
            shift_size (int): hop size between analysis frames.
            win_length (int): window length.
            window (str): name of a window-generating function on the
                ``torch`` module (e.g. ``"hann_window"``).
        """
        super(STFTLoss, self).__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        self.window = getattr(torch, window)(win_length)
        self.spectral_convergenge_loss = SpectralConvergengeLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): predicted signal (B, T).
            y (Tensor): groundtruth signal (B, T).

        Returns:
            Tensor: spectral convergence loss value.
            Tensor: log STFT magnitude loss value.
        """
        # BUGFIX: use `x.device`, not `x.get_device()`. For CPU tensors
        # `get_device()` returns -1, which makes `.to(-1)` raise; `.device`
        # works uniformly on CPU and CUDA. Move the window once and reuse
        # it for both signals.
        window = self.window.to(x.device)
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, window)
        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, window)
        sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)

        return sc_loss, mag_loss
def forward(self, x, y):
    """Average the per-resolution STFT losses over all resolutions.

    Args:
        x (Tensor): predicted signal (B, T).
        y (Tensor): groundtruth signal (B, T).

    Returns:
        Tensor: multi-resolution spectral convergence loss value.
        Tensor: multi-resolution log STFT magnitude loss value.
    """
    total_sc = 0.0
    total_mag = 0.0
    for resolution_loss in self.stft_losses:
        sc, mag = resolution_loss(x, y)
        total_sc = total_sc + sc
        total_mag = total_mag + mag
    n_resolutions = len(self.stft_losses)
    return total_sc / n_resolutions, total_mag / n_resolutions
def train(rank, a, h):
    """Run the UnivNet GAN training loop on one (possibly distributed) process.

    Args:
        rank (int): process/GPU index; rank 0 owns logging, checkpointing
            and validation.
        a: parsed command-line arguments (paths, intervals, epoch count).
        h (AttrDict): hyper-parameters loaded from the JSON config.
    """
    if h.num_gpus > 1:
        init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
                           world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)

    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda:{:d}'.format(rank))

    generator = UnivNet(h).to(device)
    mpd = MultiPeriodDiscriminator().to(device)
    msd = MultiResSpecDiscriminator().to(device)

    if rank == 0:
        print(generator)
        os.makedirs(a.checkpoint_path, exist_ok=True)
        print("checkpoints directory : ", a.checkpoint_path)

    # BUGFIX: initialise to None so non-zero ranks don't hit a NameError
    # when the checkpoint directory has not been created yet (only rank 0
    # calls makedirs above).
    cp_g = None
    cp_do = None
    if os.path.isdir(a.checkpoint_path):
        cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
        cp_do = scan_checkpoint(a.checkpoint_path, 'do_')

    steps = 0
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        mpd.load_state_dict(state_dict_do['mpd'])
        msd.load_state_dict(state_dict_do['msd'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if h.num_gpus > 1:
        generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
        mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
        msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)

    optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
    optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
                                h.learning_rate, betas=[h.adam_b1, h.adam_b2])

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)

    training_filelist, validation_filelist = get_dataset_filelist(a)

    trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
                          h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
                          shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
                          fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)

    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
                              sampler=train_sampler,
                              batch_size=h.batch_size,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
                              h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
                              fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
                              base_mels_path=a.input_mels_dir)
        validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
                                       sampler=None,
                                       batch_size=1,
                                       pin_memory=True,
                                       drop_last=True)

        sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))

    generator.train()
    mpd.train()
    msd.train()
    stft_loss = MultiResolutionSTFTLoss()
    for epoch in range(max(0, last_epoch), a.training_epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if h.num_gpus > 1:
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            if rank == 0:
                start_b = time.time()
            x, y, _, y_mel, z = batch
            # `torch.autograd.Variable` has been a no-op wrapper since
            # PyTorch 0.4; moving the tensors to the device is sufficient.
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            y_mel = y_mel.to(device, non_blocking=True)
            z = z.to(device, non_blocking=True)
            y = y.unsqueeze(1)

            y_g_hat = generator(z, x)

            # Mel of the generated audio (used for the logged mel error).
            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size,
                                          h.win_size, h.fmin, h.fmax_for_loss)

            # Discriminators are only trained after the warm-up phase.
            if steps > h.disc_start_step:
                optim_d.zero_grad()

                # MPD
                y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
                loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)

                # MSD
                y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
                loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)

                loss_disc_all = loss_disc_s + loss_disc_f
                loss_disc_all.backward()
                optim_d.step()

            # Generator
            optim_g.zero_grad()

            # Auxiliary multi-resolution STFT loss (replaces the L1 mel loss).
            sc_loss, mag_loss = stft_loss(y_g_hat[:, :, :y.size(2)].squeeze(1), y.squeeze(1))
            loss_mel = h.lambda_aux * (sc_loss + mag_loss)  # STFT Loss

            if steps > h.disc_start_step:
                y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
                y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
                loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
                loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
                loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
                loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
                loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel
            else:
                loss_gen_all = loss_mel

            loss_gen_all.backward()
            optim_g.step()

            if rank == 0:
                # BUGFIX: compute mel_error whenever either logging branch
                # needs it; previously it was only computed on the stdout
                # interval, so a summary interval that is not a multiple of
                # the stdout interval raised NameError.
                mel_error = None
                if steps % a.stdout_interval == 0 or steps % a.summary_interval == 0:
                    with torch.no_grad():
                        mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()

                # STDOUT logging
                if steps % a.stdout_interval == 0:
                    print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
                          format(steps, loss_gen_all, mel_error, time.time() - start_b))

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
                    checkpoint_path = "{}/g_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
                    checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'mpd': (mpd.module if h.num_gpus > 1
                                             else mpd).state_dict(),
                                     'msd': (msd.module if h.num_gpus > 1
                                             else msd).state_dict(),
                                     'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
                                     'epoch': epoch})

                # Tensorboard summary logging
                if steps % a.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", loss_gen_all, steps)
                    sw.add_scalar("training/mel_spec_error", mel_error, steps)

                # Validation
                if steps % a.validation_interval == 0:  # and steps != 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0.0
                    num_val_batches = 0
                    with torch.no_grad():
                        for j, val_batch in enumerate(validation_loader):
                            x_v, y_v, _, y_mel_v, z_v = val_batch
                            y_g_hat_v = generator(z_v.to(device), x_v.to(device))
                            y_mel_v = y_mel_v.to(device, non_blocking=True)
                            y_g_hat_mel_v = mel_spectrogram(y_g_hat_v.squeeze(1), h.n_fft, h.num_mels,
                                                            h.sampling_rate, h.hop_size, h.win_size,
                                                            h.fmin, h.fmax_for_loss)
                            val_err_tot += F.l1_loss(y_mel_v, y_g_hat_mel_v).item()
                            num_val_batches += 1

                            if j <= 4:
                                if steps == 0:
                                    sw.add_audio('gt/y_{}'.format(j), y_v[0], steps, h.sampling_rate)
                                    sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(x_v[0]), steps)

                                sw.add_audio('generated/y_hat_{}'.format(j), y_g_hat_v[0], steps, h.sampling_rate)
                                y_hat_spec = mel_spectrogram(y_g_hat_v.squeeze(1), h.n_fft, h.num_mels,
                                                             h.sampling_rate, h.hop_size, h.win_size,
                                                             h.fmin, h.fmax)
                                sw.add_figure('generated/y_hat_spec_{}'.format(j),
                                              plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)

                    # BUGFIX: guard against an empty validation set; the old
                    # code divided by (j + 1) with `j` possibly unbound.
                    if num_val_batches > 0:
                        val_err = val_err_tot / num_val_batches
                        sw.add_scalar("validation/mel_spec_error", val_err, steps)

                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))
class AttrDict(dict):
    """A dict whose entries are also reachable as attributes (h.key == h['key'])."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # Alias the attribute namespace to the mapping itself so item
        # access and attribute access share one storage.
        self.__dict__ = self


def build_env(config, config_name, path):
    """Copy the config file into the checkpoint directory, unless it is already there."""
    destination = os.path.join(path, config_name)
    if config != destination:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))


def plot_spectrogram(spectrogram):
    """Render a 2-D spectrogram array as a matplotlib Figure (for TensorBoard)."""
    fig, ax = plt.subplots(figsize=(10, 2))
    image = ax.imshow(spectrogram, aspect="auto", origin="lower",
                      interpolation='none')
    plt.colorbar(image, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    """Initialise Conv* module weights from N(mean, std); leave other modules alone."""
    if m.__class__.__name__.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    """Attach weight normalisation to any Conv* module."""
    if m.__class__.__name__.find("Conv") != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    """Return the padding that keeps a dilated convolution's output length unchanged."""
    return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
    """Load a checkpoint dictionary from `filepath` onto `device`."""
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict
def save_checkpoint(filepath, obj):
    """Serialise `obj` (a dict of state_dicts and counters) to `filepath`."""
    print("Saving checkpoint to {}".format(filepath))
    torch.save(obj, filepath)
    print("Complete.")


def scan_checkpoint(cp_dir, prefix):
    """Return the newest checkpoint file in `cp_dir` named `prefix` + 8 chars, or None."""
    candidates = glob.glob(os.path.join(cp_dir, prefix + '????????'))
    if not candidates:
        return None
    # Zero-padded step numbers sort lexicographically, so the last entry
    # is the most recent checkpoint.
    return sorted(candidates)[-1]