├── .gitignore ├── LICENSE ├── LJSpeech-1.1 ├── training.txt └── validation.txt ├── README.md ├── config_v1.json ├── env.py ├── iSTFTnet.PNG ├── inference.py ├── inference_e2e.py ├── meldataset.py ├── models.py ├── requirements.txt ├── stft.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | .vs/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LJSpeech-1.1/validation.txt: -------------------------------------------------------------------------------- 1 | LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order. 2 | LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security. 3 | LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex 4 | LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security. 
5 | LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination 6 | LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted, 7 | LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger. 8 | LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that, 9 | LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President, 10 | LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties. 11 | LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities. 
12 | LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic. 13 | LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo. 14 | LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence. 15 | LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny. 16 | LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds, 17 | LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform. 18 | LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness. 19 | LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating. 20 | LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied. 
21 | LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison, 22 | LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail. 23 | LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer. 24 | LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners. 25 | LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons. 26 | LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell. 27 | LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition, 28 | LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner. 
29 | LJ007-0217|They go on to say|They go on to say 30 | LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas, 31 | LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor. 32 | LJ008-0131|the other he kept between his hands.|the other he kept between his hands. 33 | LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity, 34 | LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion. 35 | LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death. 36 | LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate. 
37 | LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt. 38 | LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields. 39 | LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive, 40 | LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case, 41 | LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal, 42 | LJ010-0262|charged him with the offense.|charged him with the offense. 43 | LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen. 44 | LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person, 45 | LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell. 46 | LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray. 
47 | LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married, 48 | LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft. 49 | LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew. 50 | LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done. 51 | LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for. 52 | LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 53 | LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone. 54 | LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard. 
55 | LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe. 56 | LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 57 | LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds. 58 | LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud. 59 | LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force. 60 | LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery. 61 | LJ016-0407|who generally attended the prison services.|who generally attended the prison services. 62 | LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched. 63 | LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill. 
64 | LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling 65 | LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain. 66 | LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted. 67 | LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan. 68 | LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet. 69 | LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one. 70 | LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act. 71 | LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works. 72 | LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large. 
73 | LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport, 74 | LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution. 75 | LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay 76 | LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress? 77 | LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists. 78 | LJ026-0029|so in the case under discussion.|so in the case under discussion. 79 | LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent. 80 | LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food. 81 | LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells. 82 | LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere. 
83 | LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity. 84 | LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise: 85 | LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be, 86 | LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world. 87 | LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken, 88 | LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater. 89 | LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen, 90 | LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital. 
91 | LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries. 92 | LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital, 93 | LJ032-0100|Marina Oswald|Marina Oswald 94 | LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers. 95 | LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three, 96 | LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so. 97 | LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so. 
98 | LJ033-0135|Location of Bag|Location of Bag 99 | LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard 100 | LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote, 101 | LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building, 102 | LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly. 103 | LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade, 104 | LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor 105 | LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car 106 | LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots. 107 | LJ037-0157|taken from Oswald.|taken from Oswald. 
108 | LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting, 109 | LJ037-0219|Oswald's Jacket|Oswald's Jacket 110 | LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket. 111 | LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb. 112 | LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight. 113 | LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention. 114 | LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm. 115 | LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear. 
116 | LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards, 117 | LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle, 118 | LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years, 119 | LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave. 120 | LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic. 121 | LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone 122 | LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote. 123 | LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society. 
124 | LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide. 125 | LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR. 126 | LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle. 127 | LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination. 128 | LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required 129 | LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker. 130 | LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact 131 | LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities. 
132 | LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission, 133 | LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote, 134 | LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong. 135 | LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks. 136 | LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it. 137 | LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries, 138 | LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States. 139 | LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two 140 | LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination. 
141 | LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart, 142 | LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators. 143 | LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote. 144 | LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two, 145 | LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger, 146 | LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated 147 | LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital. 
148 | LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service, 149 | LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete; 150 | LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described. 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # iSTFTNet : Fast and Lightweight Mel-spectrogram Vocoder Incorporating Inverse Short-time Fourier Transform 2 | This repo tries to implement [iSTFTNet : Fast and Lightweight Mel-spectrogram Vocoder Incorporating Inverse Short-time Fourier Transform](https://arxiv.org/pdf/2203.02395.pdf), specifically model `C8C8I`. 3 | ![](iSTFTnet.PNG) 4 | `Disclaimer : This repo is built for testing purposes. The code is not optimized for performance.` 5 | ## Training : 6 | ``` 7 | python train.py --config config_v1.json 8 | ``` 9 | 10 | ## Note: 11 | * We are able to get good-quality audio with 30% less training compared to the original HiFi-GAN. 12 | * This model is approximately 60% faster than its HiFi-GAN counterpart. 
13 | 14 | ## Citations : 15 | ``` 16 | @inproceedings{kaneko2022istftnet, 17 | title={{iSTFTNet}: Fast and Lightweight Mel-Spectrogram Vocoder Incorporating Inverse Short-Time Fourier Transform}, 18 | author={Takuhiro Kaneko and Kou Tanaka and Hirokazu Kameoka and Shogo Seki}, 19 | booktitle={ICASSP}, 20 | year={2022}, 21 | } 22 | ``` 23 | 24 | ## References: 25 | * https://github.com/jik876/hifi-gan 26 | -------------------------------------------------------------------------------- /config_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8], 12 | "upsample_kernel_sizes": [16,16], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | "gen_istft_n_fft": 16, 17 | "gen_istft_hop_size": 4, 18 | 19 | "segment_size": 8192, 20 | "num_mels": 80, 21 | "n_fft": 1024, 22 | "hop_size": 256, 23 | "win_size": 1024, 24 | 25 | "sampling_rate": 22050, 26 | 27 | "fmin": 0, 28 | "fmax": 8000, 29 | "fmax_for_loss": null, 30 | 31 | "num_workers": 4, 32 | 33 | "dist_config": { 34 | "dist_backend": "nccl", 35 | "dist_url": "tcp://localhost:54321", 36 | "world_size": 1 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, 
config_name)) 16 | -------------------------------------------------------------------------------- /iSTFTnet.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rishikksh20/iSTFTNet-pytorch/ecbf0f635b36432bd3e432790326591bc86cadbc/iSTFTnet.PNG -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import glob 4 | import os 5 | import argparse 6 | import json 7 | import torch 8 | from scipy.io.wavfile import write 9 | from env import AttrDict 10 | from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav 11 | from models import Generator 12 | from stft import TorchSTFT 13 | 14 | 15 | h = None 16 | device = None 17 | 18 | 19 | def load_checkpoint(filepath, device): 20 | assert os.path.isfile(filepath) 21 | print("Loading '{}'".format(filepath)) 22 | checkpoint_dict = torch.load(filepath, map_location=device) 23 | print("Complete.") 24 | return checkpoint_dict 25 | 26 | 27 | def get_mel(x): 28 | return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax) 29 | 30 | 31 | def scan_checkpoint(cp_dir, prefix): 32 | pattern = os.path.join(cp_dir, prefix + '*') 33 | cp_list = glob.glob(pattern) 34 | if len(cp_list) == 0: 35 | return '' 36 | return sorted(cp_list)[-1] 37 | 38 | 39 | def inference(a): 40 | generator = Generator(h).to(device) 41 | stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device) 42 | 43 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 44 | generator.load_state_dict(state_dict_g['generator']) 45 | 46 | filelist = os.listdir(a.input_wavs_dir) 47 | 48 | os.makedirs(a.output_dir, exist_ok=True) 49 | 50 | generator.eval() 51 | 
generator.remove_weight_norm() 52 | with torch.no_grad(): 53 | for i, filname in enumerate(filelist): 54 | wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname)) 55 | wav = wav / MAX_WAV_VALUE 56 | wav = torch.FloatTensor(wav).to(device) 57 | x = get_mel(wav.unsqueeze(0)) 58 | spec, phase = generator(x) 59 | y_g_hat = stft.inverse(spec, phase) 60 | audio = y_g_hat.squeeze() 61 | audio = audio * MAX_WAV_VALUE 62 | audio = audio.cpu().numpy().astype('int16') 63 | 64 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav') 65 | write(output_file, h.sampling_rate, audio) 66 | print(output_file) 67 | 68 | 69 | def main(): 70 | print('Initializing Inference Process..') 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--input_wavs_dir', default='test_files') 74 | parser.add_argument('--output_dir', default='generated_files') 75 | parser.add_argument('--checkpoint_file', required=True) 76 | a = parser.parse_args() 77 | 78 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json') 79 | with open(config_file) as f: 80 | data = f.read() 81 | 82 | global h 83 | json_config = json.loads(data) 84 | h = AttrDict(json_config) 85 | 86 | torch.manual_seed(h.seed) 87 | global device 88 | if torch.cuda.is_available(): 89 | torch.cuda.manual_seed(h.seed) 90 | device = torch.device('cuda') 91 | else: 92 | device = torch.device('cpu') 93 | 94 | inference(a) 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | 100 | -------------------------------------------------------------------------------- /inference_e2e.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import glob 4 | import os 5 | import numpy as np 6 | import argparse 7 | import json 8 | import torch 9 | from scipy.io.wavfile import write 10 | from env import AttrDict 11 | from meldataset import MAX_WAV_VALUE 12 | 
from models import Generator 13 | from stft import TorchSTFT 14 | 15 | h = None 16 | device = None 17 | 18 | 19 | def load_checkpoint(filepath, device): 20 | assert os.path.isfile(filepath) 21 | print("Loading '{}'".format(filepath)) 22 | checkpoint_dict = torch.load(filepath, map_location=device) 23 | print("Complete.") 24 | return checkpoint_dict 25 | 26 | 27 | def scan_checkpoint(cp_dir, prefix): 28 | pattern = os.path.join(cp_dir, prefix + '*') 29 | cp_list = glob.glob(pattern) 30 | if len(cp_list) == 0: 31 | return '' 32 | return sorted(cp_list)[-1] 33 | 34 | 35 | def inference(a): 36 | generator = Generator(h).to(device) 37 | stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device) 38 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 39 | generator.load_state_dict(state_dict_g['generator']) 40 | 41 | filelist = os.listdir(a.input_mels_dir) 42 | 43 | os.makedirs(a.output_dir, exist_ok=True) 44 | 45 | generator.eval() 46 | generator.remove_weight_norm() 47 | with torch.no_grad(): 48 | for i, filname in enumerate(filelist): 49 | x = np.load(os.path.join(a.input_mels_dir, filname)) 50 | x = torch.FloatTensor(x).to(device) 51 | spec, phase = generator(x) 52 | y_g_hat = stft.inverse(spec, phase) 53 | audio = y_g_hat.squeeze() 54 | audio = audio * MAX_WAV_VALUE 55 | audio = audio.cpu().numpy().astype('int16') 56 | 57 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav') 58 | write(output_file, h.sampling_rate, audio) 59 | print(output_file) 60 | 61 | 62 | def main(): 63 | print('Initializing Inference Process..') 64 | 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument('--input_mels_dir', default='test_mel_files') 67 | parser.add_argument('--output_dir', default='generated_files_from_mel') 68 | parser.add_argument('--checkpoint_file', required=True) 69 | a = parser.parse_args() 70 | 71 | config_file = 
os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json') 72 | with open(config_file) as f: 73 | data = f.read() 74 | 75 | global h 76 | json_config = json.loads(data) 77 | h = AttrDict(json_config) 78 | 79 | torch.manual_seed(h.seed) 80 | global device 81 | if torch.cuda.is_available(): 82 | torch.cuda.manual_seed(h.seed) 83 | device = torch.device('cuda') 84 | else: 85 | device = torch.device('cpu') 86 | 87 | inference(a) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | 93 | -------------------------------------------------------------------------------- /meldataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | import torch.utils.data 6 | import numpy as np 7 | from librosa.util import normalize 8 | from scipy.io.wavfile import read 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | MAX_WAV_VALUE = 32768.0 12 | 13 | 14 | def load_wav(full_path): 15 | sampling_rate, data = read(full_path) 16 | return data, sampling_rate 17 | 18 | 19 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 20 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 21 | 22 | 23 | def dynamic_range_decompression(x, C=1): 24 | return np.exp(x) / C 25 | 26 | 27 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 28 | return torch.log(torch.clamp(x, min=clip_val) * C) 29 | 30 | 31 | def dynamic_range_decompression_torch(x, C=1): 32 | return torch.exp(x) / C 33 | 34 | 35 | def spectral_normalize_torch(magnitudes): 36 | output = dynamic_range_compression_torch(magnitudes) 37 | return output 38 | 39 | 40 | def spectral_de_normalize_torch(magnitudes): 41 | output = dynamic_range_decompression_torch(magnitudes) 42 | return output 43 | 44 | 45 | mel_basis = {} 46 | hann_window = {} 47 | 48 | 49 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 50 | if torch.min(y) < -1.: 51 | print('min 
value is ', torch.min(y)) 52 | if torch.max(y) > 1.: 53 | print('max value is ', torch.max(y)) 54 | 55 | global mel_basis, hann_window 56 | if fmax not in mel_basis: 57 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 58 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 59 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 60 | 61 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 62 | y = y.squeeze(1) 63 | 64 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 65 | center=center, pad_mode='reflect', normalized=False, onesided=True) 66 | 67 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 68 | 69 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 70 | spec = spectral_normalize_torch(spec) 71 | 72 | return spec 73 | 74 | 75 | def get_dataset_filelist(a): 76 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 77 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 78 | for x in fi.read().split('\n') if len(x) > 0] 79 | 80 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 81 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 82 | for x in fi.read().split('\n') if len(x) > 0] 83 | return training_files, validation_files 84 | 85 | 86 | class MelDataset(torch.utils.data.Dataset): 87 | def __init__(self, training_files, segment_size, n_fft, num_mels, 88 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 89 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 90 | self.audio_files = training_files 91 | random.seed(1234) 92 | if shuffle: 93 | random.shuffle(self.audio_files) 94 | self.segment_size = segment_size 95 | self.sampling_rate = sampling_rate 96 | self.split = split 97 | self.n_fft = n_fft 98 | 
self.num_mels = num_mels 99 | self.hop_size = hop_size 100 | self.win_size = win_size 101 | self.fmin = fmin 102 | self.fmax = fmax 103 | self.fmax_loss = fmax_loss 104 | self.cached_wav = None 105 | self.n_cache_reuse = n_cache_reuse 106 | self._cache_ref_count = 0 107 | self.device = device 108 | self.fine_tuning = fine_tuning 109 | self.base_mels_path = base_mels_path 110 | 111 | def __getitem__(self, index): 112 | filename = self.audio_files[index] 113 | if self._cache_ref_count == 0: 114 | audio, sampling_rate = load_wav(filename) 115 | audio = audio / MAX_WAV_VALUE 116 | if not self.fine_tuning: 117 | audio = normalize(audio) * 0.95 118 | self.cached_wav = audio 119 | if sampling_rate != self.sampling_rate: 120 | raise ValueError("{} SR doesn't match target {} SR".format( 121 | sampling_rate, self.sampling_rate)) 122 | self._cache_ref_count = self.n_cache_reuse 123 | else: 124 | audio = self.cached_wav 125 | self._cache_ref_count -= 1 126 | 127 | audio = torch.FloatTensor(audio) 128 | audio = audio.unsqueeze(0) 129 | 130 | if not self.fine_tuning: 131 | if self.split: 132 | if audio.size(1) >= self.segment_size: 133 | max_audio_start = audio.size(1) - self.segment_size 134 | audio_start = random.randint(0, max_audio_start) 135 | audio = audio[:, audio_start:audio_start+self.segment_size] 136 | else: 137 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 138 | 139 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 140 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 141 | center=False) 142 | else: 143 | mel = np.load( 144 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 145 | mel = torch.from_numpy(mel) 146 | 147 | if len(mel.shape) < 3: 148 | mel = mel.unsqueeze(0) 149 | 150 | if self.split: 151 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 152 | 153 | if audio.size(1) >= self.segment_size: 154 | mel_start = 
# --- meldataset.py (continued; this view opens mid-file) -----------------------
# NOTE(review): the chunk begins in the middle of MelDataset.__getitem__; the
# enclosing def/class header is outside this view, so the visible tail is kept
# below as a commented reference rather than rewritten blind:
#
#           ... random.randint(0, mel.size(2) - frames_per_seg - 1)
#           mel = mel[:, :, mel_start:mel_start + frames_per_seg]
#           audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
#       else:
#           mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
#           audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
#
#       mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
#                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin,
#                                  self.fmax_loss, center=False)
#
#       return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
#
#   def __len__(self):
#       return len(self.audio_files)


# --- models.py -----------------------------------------------------------------
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from utils import init_weights, get_padding

LRELU_SLOPE = 0.1  # negative slope shared by every leaky-ReLU in the GAN


class ResBlock1(torch.nn.Module):
    """HiFi-GAN residual block: pairs of (dilated conv -> undilated conv).

    Args:
        h: hyper-parameter namespace (kept for parity with the other modules).
        channels: channel count, preserved end to end.
        kernel_size: conv kernel size (odd, so 'same' padding holds).
        dilation: dilation of each conv in the first stack.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        # First stack: growing dilations widen the receptive field.
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d,
                               padding=get_padding(kernel_size, d)))
            for d in dilation
        ])
        self.convs1.apply(init_weights)

        # Second stack: one undilated conv per branch above (generalized from
        # the upstream hard-coded three entries; identical for the defaults).
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in dilation
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x  # residual connection
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    """Lighter HiFi-GAN residual block: a single stack of dilated convs."""

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d,
                               padding=get_padding(kernel_size, d)))
            for d in dilation
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    """iSTFTNet generator: mel (B, 80, T) -> (magnitude, phase) STFT frames.

    Unlike vanilla HiFi-GAN it does not emit a waveform directly; the caller
    feeds the returned (spec, phase) pair to an inverse STFT.
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        # Transposed-conv upsampling chain; channels halve at every stage.
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(h.upsample_initial_channel // (2 ** i),
                                h.upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        # One multi-receptive-field group of resblocks per upsample stage.
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, k, d))

        # Post conv emits n_fft/2 + 1 magnitude bins and n_fft/2 + 1 phase bins.
        self.post_n_fft = h.gen_istft_n_fft
        self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels  # average the MRF branches
        x = F.leaky_relu(x)
        x = self.reflection_pad(x)
        x = self.conv_post(x)
        # exp() keeps the magnitude strictly positive; sin() bounds the phase proxy.
        spec = torch.exp(x[:, :self.post_n_fft // 2 + 1, :])
        phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])

        return spec, phase

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform to (T/period, period) 2-D."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # FIX: upstream hard-coded get_padding(5, 1) (and a literal 2 on the
        # last conv) regardless of kernel_size; derive padding from kernel_size
        # so non-default kernels stay 'same'-padded. Identical for the default 5.
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened score map, list of intermediate feature maps)."""
        fmap = []

        # 1d to 2d: right-pad so t divides the period, then fold.
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of DiscriminatorP over co-prime periods 2, 3, 5, 7, 11."""

    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([DiscriminatorP(p) for p in (2, 3, 5, 7, 11)])

    def forward(self, y, y_hat):
        """Score real y and generated y_hat with every sub-discriminator."""
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: stack of strided, grouped 1-D convs."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened score map, list of intermediate feature maps)."""
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    """Three DiscriminatorS at 1x, 2x and 4x average-pooled resolution."""

    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),  # raw scale uses spectral norm
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2),
            AvgPool1d(4, 2, padding=2)
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                # Downsample once per extra scale (cumulative across iterations).
                y = self.meanpools[i - 1](y)
                y_hat = self.meanpools[i - 1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    """L1 feature-matching loss over all discriminator layers, scaled by 2."""
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """LSGAN discriminator loss: real targets 1, fake targets 0.

    Returns (total loss, per-discriminator real losses, fake losses); the
    per-item values are detached via .item() for logging.
    """
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - dr) ** 2)
        g_loss = torch.mean(dg ** 2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    """LSGAN generator loss: push every fake score toward 1."""
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1 - dg) ** 2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses


# --- requirements.txt ----------------------------------------------------------
# torch
# numpy==1.17.4
# librosa==0.7.2
# scipy==1.4.1
# tensorboard==2.0
# soundfile==0.10.3.post1
# matplotlib==3.1.3

# --- stft.py -------------------------------------------------------------------
# BSD 3-Clause License
# Copyright (c) 2017, Prem Seetharaman
# All rights reserved.
# * Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
# (stft.py BSD-3-Clause header, continued)
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope, clipping the last frame at the buffer edge
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    Conv-based STFT/iSTFT: the (windowed) Fourier bases are registered as
    buffers and applied with conv1d / conv_transpose1d.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep only the non-redundant half of the spectrum; stack real over imag.
        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert (filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        """Return (magnitude, phase) of input_data, shape (batch, samples)."""
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input by half a frame each side
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
        phase = torch.autograd.Variable(
            torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct a waveform from (magnitude, phase) frames."""
        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects where the envelope is numerically safe
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False)
            # FIX: Tensor.device is a property, not a method — the original
            # `inverse_transform.device()` raised TypeError on any CUDA run.
            window_sum = window_sum.to(inverse_transform.device) if magnitude.is_cuda else window_sum
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Trim the half-frame reflect padding added in transform().
        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2):]
        inverse_transform = inverse_transform[:, :, :-int(self.filter_length / 2)]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction


class TorchSTFT(torch.nn.Module):
    """Thin wrapper over torch.stft / torch.istft with a fixed window."""

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))

    def transform(self, input_data):
        """Return (magnitude, phase) via torch.stft."""
        forward_transform = torch.stft(
            input_data,
            self.filter_length, self.hop_length, self.win_length,
            # FIX: torch.stft requires the window on the input's device; the
            # plain-attribute window otherwise stays on CPU and CUDA runs fail.
            window=self.window.to(input_data.device),
            return_complex=True)

        return torch.abs(forward_transform), torch.angle(forward_transform)

    def inverse(self, magnitude, phase):
        """Reconstruct a waveform from (magnitude, phase) via torch.istft."""
        inverse_transform = torch.istft(
            magnitude * torch.exp(phase * 1j),
            self.filter_length, self.hop_length, self.win_length,
            window=self.window.to(magnitude.device))  # FIX: match input device

        return inverse_transform.unsqueeze(-2)  # unsqueeze to stay consistent with conv_transpose1d implementation

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction


# --- train.py ------------------------------------------------------------------
# ------------------------------------------------------------------------------
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import itertools
import os
import time
import argparse
import json
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DistributedSampler, DataLoader
import torch.multiprocessing as mp
from torch.distributed import init_process_group
from torch.nn.parallel import DistributedDataParallel
from env import AttrDict, build_env
from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
    discriminator_loss
from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
from stft import TorchSTFT

torch.backends.cudnn.benchmark = True


def train(rank, a, h):
    """Per-process training loop (rank 0 additionally logs, validates and
    checkpoints).  `a` holds CLI arguments, `h` the JSON hyper-parameters."""
    if h.num_gpus > 1:
        init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
                           world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)

    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda:{:d}'.format(rank))

    generator = Generator(h).to(device)
    mpd = MultiPeriodDiscriminator().to(device)
    msd = MultiScaleDiscriminator().to(device)
    stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size,
                     win_length=h.gen_istft_n_fft).to(device)

    if rank == 0:
        print(generator)
        os.makedirs(a.checkpoint_path, exist_ok=True)
        print("checkpoints directory : ", a.checkpoint_path)

    # FIX: default both to None so a missing checkpoint directory (e.g. a
    # non-zero rank racing rank 0's makedirs) cannot leave them unbound below.
    cp_g = cp_do = None
    if os.path.isdir(a.checkpoint_path):
        cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
        cp_do = scan_checkpoint(a.checkpoint_path, 'do_')

    steps = 0
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        mpd.load_state_dict(state_dict_do['mpd'])
        msd.load_state_dict(state_dict_do['msd'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if h.num_gpus > 1:
        generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
        mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
        msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)

    optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
    optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
                                h.learning_rate, betas=[h.adam_b1, h.adam_b2])

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)

    training_filelist, validation_filelist = get_dataset_filelist(a)

    trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
                          h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
                          shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
                          fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)

    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
                              sampler=train_sampler,
                              batch_size=h.batch_size,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
                              h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
                              fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
                              base_mels_path=a.input_mels_dir)
        validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
                                       sampler=None,
                                       batch_size=1,
                                       pin_memory=True,
                                       drop_last=True)

        sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))

    generator.train()
    mpd.train()
    msd.train()
    for epoch in range(max(0, last_epoch), a.training_epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if h.num_gpus > 1:
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            if rank == 0:
                start_b = time.time()
            x, y, _, y_mel = batch
            x = torch.autograd.Variable(x.to(device, non_blocking=True))
            y = torch.autograd.Variable(y.to(device, non_blocking=True))
            y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
            y = y.unsqueeze(1)

            # The generator emits (spec, phase); the waveform comes from iSTFT.
            spec, phase = generator(x)
            y_g_hat = stft.inverse(spec, phase)

            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size,
                                          h.win_size, h.fmin, h.fmax_for_loss)

            # --- discriminator step (generator output detached) ---
            optim_d.zero_grad()

            # MPD
            y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
            loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)

            # MSD
            y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
            loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)

            loss_disc_all = loss_disc_s + loss_disc_f

            loss_disc_all.backward()
            optim_d.step()

            # --- generator step ---
            optim_g.zero_grad()

            # L1 Mel-Spectrogram Loss (weight 45, per the HiFi-GAN recipe)
            loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45

            y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
            y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
            loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
            loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
            loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
            loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
            loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel

            loss_gen_all.backward()
            optim_g.step()

            if rank == 0:
                # STDOUT logging
                if steps % a.stdout_interval == 0:
                    with torch.no_grad():
                        mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()

                    print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
                          format(steps, loss_gen_all, mel_error, time.time() - start_b))

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
                    checkpoint_path = "{}/g_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
                    checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'mpd': (mpd.module if h.num_gpus > 1
                                             else mpd).state_dict(),
                                     'msd': (msd.module if h.num_gpus > 1
                                             else msd).state_dict(),
                                     'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
                                     'epoch': epoch})

                # Tensorboard summary logging
                # NOTE(review): relies on summary_interval being a multiple of
                # stdout_interval so mel_error is always bound here.
                if steps % a.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", loss_gen_all, steps)
                    sw.add_scalar("training/mel_spec_error", mel_error, steps)

                # Validation
                if steps % a.validation_interval == 0:  # and steps != 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0
                    with torch.no_grad():
                        for j, batch in enumerate(validation_loader):
                            x, y, _, y_mel = batch
                            spec, phase = generator(x.to(device))
                            y_g_hat = stft.inverse(spec, phase)

                            y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
                            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
                                                          h.hop_size, h.win_size,
                                                          h.fmin, h.fmax_for_loss)
                            val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()

                            if j <= 4:
                                if steps == 0:
                                    sw.add_audio('gt/y_{}'.format(j), y[0], steps, h.sampling_rate)
                                    sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(x[0]), steps)

                                sw.add_audio('generated/y_hat_{}'.format(j), y_g_hat[0], steps, h.sampling_rate)
                                y_hat_spec = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels,
                                                             h.sampling_rate, h.hop_size, h.win_size,
                                                             h.fmin, h.fmax)
                                sw.add_figure('generated/y_hat_spec_{}'.format(j),
                                              plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)

                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/mel_spec_error", val_err, steps)

                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))


def main():
    """Parse CLI args + JSON config, then launch single- or multi-GPU training."""
    print('Initializing Training Process..')

    parser = argparse.ArgumentParser()

    parser.add_argument('--group_name', default=None)
    parser.add_argument('--input_wavs_dir', default='LJSpeech-1.1/wavs')
    parser.add_argument('--input_mels_dir', default='ft_dataset')
    parser.add_argument('--input_training_file', default='LJSpeech-1.1/training.txt')
    parser.add_argument('--input_validation_file', default='LJSpeech-1.1/validation.txt')
    parser.add_argument('--checkpoint_path', default='cp_hifigan')
    parser.add_argument('--config', default='')
    parser.add_argument('--training_epochs', default=3100, type=int)
    parser.add_argument('--stdout_interval', default=5, type=int)
    parser.add_argument('--checkpoint_interval', default=5000, type=int)
    parser.add_argument('--summary_interval', default=100, type=int)
    parser.add_argument('--validation_interval', default=1000, type=int)
    # FIX: argparse's type=bool treats ANY non-empty string (even "False") as
    # True; parse truthy strings explicitly while keeping the same CLI shape.
    parser.add_argument('--fine_tuning', default=False,
                        type=lambda v: str(v).lower() in ('1', 'true', 'yes', 'y'))

    a = parser.parse_args()

    with open(a.config) as f:
        data = f.read()

    json_config = json.loads(data)
    h = AttrDict(json_config)
    build_env(a.config, 'config.json', a.checkpoint_path)

    torch.manual_seed(h.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(h.seed)
        h.num_gpus = torch.cuda.device_count()
        h.batch_size = int(h.batch_size / h.num_gpus)
        print('Batch size per GPU :', h.batch_size)
    else:
        # FIX: previously `pass`, which left h.num_gpus undefined and crashed
        # the `h.num_gpus > 1` check below whenever CUDA was unavailable.
        h.num_gpus = 0

    if h.num_gpus > 1:
        mp.spawn(train, nprocs=h.num_gpus, args=(a, h,))
    else:
        train(0, a, h)


if __name__ == '__main__':
    main()


# --- utils.py ------------------------------------------------------------------
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")  # headless backend: figures go to TensorBoard, not a GUI
import matplotlib.pylab as plt


def plot_spectrogram(spectrogram):
    """Render a 2-D spectrogram array to a matplotlib Figure (for TensorBoard)."""
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    """Normal-initialize conv weights; intended for Module.apply()."""
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    """Attach weight norm to every conv layer; intended for Module.apply()."""
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    """'Same' padding for a stride-1 dilated conv with an odd kernel."""
    return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
    """Load a torch checkpoint dict, mapping storage onto `device`."""
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    """Serialize `obj` to `filepath` with torch.save."""
    print("Saving checkpoint to {}".format(filepath))
    torch.save(obj, filepath)
    print("Complete.")


def scan_checkpoint(cp_dir, prefix):
    """Return the newest checkpoint matching `prefix` + 8 chars, or None."""
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]

# ------------------------------------------------------------------------------