├── .gitignore
├── LICENSE
├── README.md
├── classification
│   ├── attention_lstm.py
│   ├── attention_utils.py
│   ├── cnn_bilstm.py
│   ├── conv_1d_model.py
│   ├── conv_1d_model_aws.py
│   ├── conv_1d_model_run.py
│   ├── log.txt
│   ├── logs
│   │   └── split_0.15_batchsize_10
│   │       ├── events.out.tfevents.1569179240.elem
│   │       └── events.out.tfevents.1569217478.elem
│   ├── lstm_cnn.py
│   ├── model_5epochs_rnn.h5
│   ├── model_hin_tel_38_samples.h5
│   ├── rnn_example.py
│   ├── testing.py
│   └── testing.pyc
├── data
│   ├── all_accents
│   │   ├── accent_trim_gen_x.py
│   │   ├── accent_trim_gen_y.py
│   │   ├── all_accents_split.log.save
│   │   ├── all_accents_trim.sh
│   │   ├── alt_split.py
│   │   ├── mfcc.py
│   │   └── split_to_wav.sh
│   ├── folder_structure
│   ├── non_trained_accents
│   │   ├── accent_trim_gen_x.py
│   │   ├── accent_trim_gen_y.py
│   │   └── mfcc.py
│   └── numpy_vectors
│       ├── conv1d.ipynb - Colaboratory3:1.pdf
│       ├── conv1d.ipynb - Colaboratoryacc99ep10.pdf
│       ├── conv1d.ipynb - Colaboratoryacc99ep10new.pdf
│       ├── conv1d.ipynb - Colaboratoryacc99ep12.pdf
│       └── conv1d.ipynb acc1 splithalf - Colaboratory.pdf
├── harvard_sentences.txt
├── helpers
│   ├── __init__.py
│   ├── alt_split.py
│   ├── convert_raw_to_processed.py
│   ├── file_manager.py
│   ├── run.py
│   ├── split_to_wav.sh.save
│   └── split_wav.py
├── ipynb-htmls
│   ├── all_accents.html
│   ├── all_accents_2.html
│   ├── conv1d (1).html
│   ├── conv1d (1).ipynb
│   ├── conv1d (2).ipynb
│   ├── conv1d (3).ipynb
│   ├── conv1d.html
│   ├── conv1d.ipynb
│   ├── train_on_one_person_test_on_other_conv1d (1).html
│   ├── train_on_one_person_test_on_other_conv1d (1).ipynb
│   └── train_on_one_person_test_on_other_conv1d.ipynb
├── notebooks
│   ├── AccentDB_Classification_Colab.ipynb
│   └── pase.ipynb
├── repo.tree
└── speech2vec
    ├── all_split.sh
    ├── gen_x.py
    ├── gen_y.py
    ├── mfcc.py
    └── mp3_getter.py

/.gitignore:
--------------------------------------------------------------------------------

###Python###

# dataset and processed files
*.wav
*.npy

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | .static_storage/ 62 | .media/ 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | 112 | 113 | ###IntelliJ### 114 | 115 | *.iml 116 | *.ipr 117 | *.iws 118 | .idea/ 119 | 120 | 121 | ###VisualStudio### 122 | 123 | ## Ignore Visual Studio temporary files, build results, and 124 | ## files generated by popular Visual Studio add-ons. 125 | ## 126 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 127 | 128 | # User-specific files 129 | *.suo 130 | *.user 131 | *.userosscache 132 | *.sln.docstates 133 | 134 | # User-specific files (MonoDevelop/Xamarin Studio) 135 | *.userprefs 136 | 137 | # Build results 138 | [Dd]ebug/ 139 | [Dd]ebugPublic/ 140 | [Rr]elease/ 141 | [Rr]eleases/ 142 | x64/ 143 | x86/ 144 | bld/ 145 | [Bb]in/ 146 | [Oo]bj/ 147 | [Ll]og/ 148 | 149 | # Visual Studio 2015/2017 cache/options directory 150 | .vs/ 151 | # Uncomment if you have tasks that create the project's static files in wwwroot 152 | #wwwroot/ 153 | 154 | # Visual Studio 2017 auto generated files 155 | Generated\ Files/ 156 | 157 | # MSTest test Results 158 | [Tt]est[Rr]esult*/ 159 | [Bb]uild[Ll]og.* 160 | 161 | # NUNIT 162 | *.VisualState.xml 163 | TestResult.xml 164 | 165 | # Build Results of an ATL Project 166 | [Dd]ebugPS/ 167 | [Rr]eleasePS/ 168 | dlldata.c 169 | 170 | # Benchmark Results 171 | BenchmarkDotNet.Artifacts/ 172 | 173 | # .NET Core 174 | project.lock.json 175 | project.fragment.lock.json 176 | artifacts/ 177 | **/Properties/launchSettings.json 178 | 179 | # StyleCop 180 | StyleCopReport.xml 181 | 182 | # Files built by Visual Studio 183 | *_i.c 184 | *_p.c 185 | *_i.h 186 | *.ilk 187 | *.meta 188 | *.obj 189 | *.pch 190 | *.pdb 191 | *.pgc 192 | *.pgd 193 | *.rsp 194 | *.sbr 195 | *.tlb 196 | *.tli 197 | *.tlh 198 | *.tmp 199 | *.tmp_proj 200 | *.log 201 | *.vspscc 202 | *.vssscc 203 | .builds 204 | *.pidb 205 | *.svclog 206 | *.scc 207 | 208 | # Chutzpah Test files 209 | _Chutzpah* 210 | 211 | # Visual C++ cache files 212 | ipch/ 213 | *.aps 214 | *.ncb 215 | *.opendb 216 | *.opensdf 217 | *.sdf 218 | *.cachefile 219 | *.VC.db 220 | *.VC.VC.opendb 221 | 222 | # Visual Studio profiler 223 | *.psess 224 | *.vsp 225 | *.vspx 226 | *.sap 227 | 228 | # Visual Studio Trace Files 229 | *.e2e 230 | 231 | # TFS 2012 Local Workspace 232 | $tf/ 233 | 234 | # Guidance Automation Toolkit 235 | *.gpState 236 | 237 | # ReSharper is a .NET coding add-in 238 | _ReSharper*/ 239 | *.[Rr]e[Ss]harper 240 | *.DotSettings.user 241 | 242 
| # JustCode is a .NET coding add-in 243 | .JustCode 244 | 245 | # TeamCity is a build add-in 246 | _TeamCity* 247 | 248 | # DotCover is a Code Coverage Tool 249 | *.dotCover 250 | 251 | # AxoCover is a Code Coverage Tool 252 | .axoCover/* 253 | !.axoCover/settings.json 254 | 255 | # Visual Studio code coverage results 256 | *.coverage 257 | *.coveragexml 258 | 259 | # NCrunch 260 | _NCrunch_* 261 | .*crunch*.local.xml 262 | nCrunchTemp_* 263 | 264 | # MightyMoose 265 | *.mm.* 266 | AutoTest.Net/ 267 | 268 | # Web workbench (sass) 269 | .sass-cache/ 270 | 271 | # Installshield output folder 272 | [Ee]xpress/ 273 | 274 | # DocProject is a documentation generator add-in 275 | DocProject/buildhelp/ 276 | DocProject/Help/*.HxT 277 | DocProject/Help/*.HxC 278 | DocProject/Help/*.hhc 279 | DocProject/Help/*.hhk 280 | DocProject/Help/*.hhp 281 | DocProject/Help/Html2 282 | DocProject/Help/html 283 | 284 | # Click-Once directory 285 | publish/ 286 | 287 | # Publish Web Output 288 | *.[Pp]ublish.xml 289 | *.azurePubxml 290 | # Note: Comment the next line if you want to checkin your web deploy settings, 291 | # but database connection strings (with potential passwords) will be unencrypted 292 | *.pubxml 293 | *.publishproj 294 | 295 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 296 | # checkin your Azure Web App publish settings, but sensitive information contained 297 | # in these scripts will be unencrypted 298 | PublishScripts/ 299 | 300 | # NuGet Packages 301 | *.nupkg 302 | # The packages folder can be ignored because of Package Restore 303 | **/[Pp]ackages/* 304 | # except build/, which is used as an MSBuild target. 305 | !**/[Pp]ackages/build/ 306 | # Uncomment if necessary however generally it will be regenerated when needed 307 | #!**/[Pp]ackages/repositories.config 308 | # NuGet v3's project.json files produces more ignorable files 309 | *.nuget.props 310 | *.nuget.targets 311 | 312 | # Microsoft Azure Build Output 313 | csx/ 314 | *.build.csdef 315 | 316 | # Microsoft Azure Emulator 317 | ecf/ 318 | rcf/ 319 | 320 | # Windows Store app package directories and files 321 | AppPackages/ 322 | BundleArtifacts/ 323 | Package.StoreAssociation.xml 324 | _pkginfo.txt 325 | *.appx 326 | 327 | # Visual Studio cache files 328 | # files ending in .cache can be ignored 329 | *.[Cc]ache 330 | # but keep track of directories ending in .cache 331 | !*.[Cc]ache/ 332 | 333 | # Others 334 | ClientBin/ 335 | ~$* 336 | *~ 337 | *.dbmdl 338 | *.dbproj.schemaview 339 | *.jfm 340 | *.pfx 341 | *.publishsettings 342 | orleans.codegen.cs 343 | 344 | # Including strong name files can present a security risk 345 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 346 | #*.snk 347 | 348 | # Since there are multiple workflows, uncomment next line to ignore bower_components 349 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 350 | #bower_components/ 351 | 352 | # RIA/Silverlight projects 353 | Generated_Code/ 354 | 355 | # Backup & report files from converting an old project file 356 | # to a newer Visual Studio version. 
Backup files are not needed, 357 | # because we have git ;-) 358 | _UpgradeReport_Files/ 359 | Backup*/ 360 | UpgradeLog*.XML 361 | UpgradeLog*.htm 362 | ServiceFabricBackup/ 363 | 364 | # SQL Server files 365 | *.mdf 366 | *.ldf 367 | *.ndf 368 | 369 | # Business Intelligence projects 370 | *.rdl.data 371 | *.bim.layout 372 | *.bim_*.settings 373 | 374 | # Microsoft Fakes 375 | FakesAssemblies/ 376 | 377 | # GhostDoc plugin setting file 378 | *.GhostDoc.xml 379 | 380 | # Node.js Tools for Visual Studio 381 | .ntvs_analysis.dat 382 | node_modules/ 383 | 384 | # TypeScript v1 declaration files 385 | typings/ 386 | 387 | # Visual Studio 6 build log 388 | *.plg 389 | 390 | # Visual Studio 6 workspace options file 391 | *.opt 392 | 393 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 394 | *.vbw 395 | 396 | # Visual Studio LightSwitch build output 397 | **/*.HTMLClient/GeneratedArtifacts 398 | **/*.DesktopClient/GeneratedArtifacts 399 | **/*.DesktopClient/ModelManifest.xml 400 | **/*.Server/GeneratedArtifacts 401 | **/*.Server/ModelManifest.xml 402 | _Pvt_Extensions 403 | 404 | # Paket dependency manager 405 | .paket/paket.exe 406 | paket-files/ 407 | 408 | # FAKE - F# Make 409 | .fake/ 410 | 411 | # JetBrains Rider 412 | .idea/ 413 | *.sln.iml 414 | 415 | # CodeRush 416 | .cr/ 417 | 418 | # Python Tools for Visual Studio (PTVS) 419 | __pycache__/ 420 | *.pyc 421 | 422 | # Cake - Uncomment if you are using it 423 | # tools/** 424 | # !tools/packages.config 425 | 426 | # Tabs Studio 427 | *.tss 428 | 429 | # Telerik's JustMock configuration file 430 | *.jmconfig 431 | 432 | # BizTalk build output 433 | *.btp.cs 434 | *.btm.cs 435 | *.odx.cs 436 | *.xsd.cs 437 | 438 | # OpenCover UI analysis results 439 | OpenCover/ 440 | 441 | # Azure Stream Analytics local run output 442 | ASALocalRun/ 443 | 444 | # MSBuild Binary and Structured Log 445 | *.binlog -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | "AccentDB: A Database of Non-Native English Accents to Assist Neural 2 | Speech Recognition" (c) by Ahamad, Afroz; Anand, Ankit; Bhargava, 3 | Pranesh. 4 | 5 | "AccentDB: A Database of Non-Native English Accents to Assist Neural 6 | Speech Recognition" is licensed under a Creative Commons 7 | Attribution-NonCommercial 4.0 International License. 8 | 9 | Attribution-NonCommercial 4.0 International 10 | 11 | ======================================================================= 12 | 13 | Creative Commons Corporation ("Creative Commons") is not a law firm and 14 | does not provide legal services or legal advice. Distribution of 15 | Creative Commons public licenses does not create a lawyer-client or 16 | other relationship. Creative Commons makes its licenses and related 17 | information available on an "as-is" basis. Creative Commons gives no 18 | warranties regarding its licenses, any material licensed under their 19 | terms and conditions, or any related information. Creative Commons 20 | disclaims all liability for damages resulting from their use to the 21 | fullest extent possible. 22 | 23 | Using Creative Commons Public Licenses 24 | 25 | Creative Commons public licenses provide a standard set of terms and 26 | conditions that creators and other rights holders may use to share 27 | original works of authorship and other material subject to copyright 28 | and certain other rights specified in the public license below. 
The 29 | following considerations are for informational purposes only, are not 30 | exhaustive, and do not form part of our licenses. 31 | 32 | Considerations for licensors: Our public licenses are 33 | intended for use by those authorized to give the public 34 | permission to use material in ways otherwise restricted by 35 | copyright and certain other rights. Our licenses are 36 | irrevocable. Licensors should read and understand the terms 37 | and conditions of the license they choose before applying it. 38 | Licensors should also secure all rights necessary before 39 | applying our licenses so that the public can reuse the 40 | material as expected. Licensors should clearly mark any 41 | material not subject to the license. This includes other CC- 42 | licensed material, or material used under an exception or 43 | limitation to copyright. More considerations for licensors: 44 | wiki.creativecommons.org/Considerations_for_licensors 45 | 46 | Considerations for the public: By using one of our public 47 | licenses, a licensor grants the public permission to use the 48 | licensed material under specified terms and conditions. If 49 | the licensor's permission is not necessary for any reason--for 50 | example, because of any applicable exception or limitation to 51 | copyright--then that use is not regulated by the license. Our 52 | licenses grant only permissions under copyright and certain 53 | other rights that a licensor has authority to grant. Use of 54 | the licensed material may still be restricted for other 55 | reasons, including because others have copyright or other 56 | rights in the material. A licensor may make special requests, 57 | such as asking that all changes be marked or described. 58 | Although not required by our licenses, you are encouraged to 59 | respect those requests where reasonable. More considerations 60 | for the public: 61 | wiki.creativecommons.org/Considerations_for_licensees 62 | 63 | ======================================================================= 64 | 65 | Creative Commons Attribution-NonCommercial 4.0 International Public 66 | License 67 | 68 | By exercising the Licensed Rights (defined below), You accept and agree 69 | to be bound by the terms and conditions of this Creative Commons 70 | Attribution-NonCommercial 4.0 International Public License ("Public 71 | License"). To the extent this Public License may be interpreted as a 72 | contract, You are granted the Licensed Rights in consideration of Your 73 | acceptance of these terms and conditions, and the Licensor grants You 74 | such rights in consideration of benefits the Licensor receives from 75 | making the Licensed Material available under these terms and 76 | conditions. 77 | 78 | 79 | Section 1 -- Definitions. 80 | 81 | a. Adapted Material means material subject to Copyright and Similar 82 | Rights that is derived from or based upon the Licensed Material 83 | and in which the Licensed Material is translated, altered, 84 | arranged, transformed, or otherwise modified in a manner requiring 85 | permission under the Copyright and Similar Rights held by the 86 | Licensor. For purposes of this Public License, where the Licensed 87 | Material is a musical work, performance, or sound recording, 88 | Adapted Material is always produced where the Licensed Material is 89 | synched in timed relation with a moving image. 90 | 91 | b. 
Adapter's License means the license You apply to Your Copyright 92 | and Similar Rights in Your contributions to Adapted Material in 93 | accordance with the terms and conditions of this Public License. 94 | 95 | c. Copyright and Similar Rights means copyright and/or similar rights 96 | closely related to copyright including, without limitation, 97 | performance, broadcast, sound recording, and Sui Generis Database 98 | Rights, without regard to how the rights are labeled or 99 | categorized. For purposes of this Public License, the rights 100 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 101 | Rights. 102 | d. Effective Technological Measures means those measures that, in the 103 | absence of proper authority, may not be circumvented under laws 104 | fulfilling obligations under Article 11 of the WIPO Copyright 105 | Treaty adopted on December 20, 1996, and/or similar international 106 | agreements. 107 | 108 | e. Exceptions and Limitations means fair use, fair dealing, and/or 109 | any other exception or limitation to Copyright and Similar Rights 110 | that applies to Your use of the Licensed Material. 111 | 112 | f. Licensed Material means the artistic or literary work, database, 113 | or other material to which the Licensor applied this Public 114 | License. 115 | 116 | g. Licensed Rights means the rights granted to You subject to the 117 | terms and conditions of this Public License, which are limited to 118 | all Copyright and Similar Rights that apply to Your use of the 119 | Licensed Material and that the Licensor has authority to license. 120 | 121 | h. Licensor means the individual(s) or entity(ies) granting rights 122 | under this Public License. 123 | 124 | i. NonCommercial means not primarily intended for or directed towards 125 | commercial advantage or monetary compensation. For purposes of 126 | this Public License, the exchange of the Licensed Material for 127 | other material subject to Copyright and Similar Rights by digital 128 | file-sharing or similar means is NonCommercial provided there is 129 | no payment of monetary compensation in connection with the 130 | exchange. 131 | 132 | j. Share means to provide material to the public by any means or 133 | process that requires permission under the Licensed Rights, such 134 | as reproduction, public display, public performance, distribution, 135 | dissemination, communication, or importation, and to make material 136 | available to the public including in ways that members of the 137 | public may access the material from a place and at a time 138 | individually chosen by them. 139 | 140 | k. Sui Generis Database Rights means rights other than copyright 141 | resulting from Directive 96/9/EC of the European Parliament and of 142 | the Council of 11 March 1996 on the legal protection of databases, 143 | as amended and/or succeeded, as well as other essentially 144 | equivalent rights anywhere in the world. 145 | 146 | l. You means the individual or entity exercising the Licensed Rights 147 | under this Public License. Your has a corresponding meaning. 148 | 149 | 150 | Section 2 -- Scope. 151 | 152 | a. License grant. 153 | 154 | 1. Subject to the terms and conditions of this Public License, 155 | the Licensor hereby grants You a worldwide, royalty-free, 156 | non-sublicensable, non-exclusive, irrevocable license to 157 | exercise the Licensed Rights in the Licensed Material to: 158 | 159 | a. 
reproduce and Share the Licensed Material, in whole or 160 | in part, for NonCommercial purposes only; and 161 | 162 | b. produce, reproduce, and Share Adapted Material for 163 | NonCommercial purposes only. 164 | 165 | 2. Exceptions and Limitations. For the avoidance of doubt, where 166 | Exceptions and Limitations apply to Your use, this Public 167 | License does not apply, and You do not need to comply with 168 | its terms and conditions. 169 | 170 | 3. Term. The term of this Public License is specified in Section 171 | 6(a). 172 | 173 | 4. Media and formats; technical modifications allowed. The 174 | Licensor authorizes You to exercise the Licensed Rights in 175 | all media and formats whether now known or hereafter created, 176 | and to make technical modifications necessary to do so. The 177 | Licensor waives and/or agrees not to assert any right or 178 | authority to forbid You from making technical modifications 179 | necessary to exercise the Licensed Rights, including 180 | technical modifications necessary to circumvent Effective 181 | Technological Measures. For purposes of this Public License, 182 | simply making modifications authorized by this Section 2(a) 183 | (4) never produces Adapted Material. 184 | 185 | 5. Downstream recipients. 186 | 187 | a. Offer from the Licensor -- Licensed Material. Every 188 | recipient of the Licensed Material automatically 189 | receives an offer from the Licensor to exercise the 190 | Licensed Rights under the terms and conditions of this 191 | Public License. 192 | 193 | b. No downstream restrictions. You may not offer or impose 194 | any additional or different terms or conditions on, or 195 | apply any Effective Technological Measures to, the 196 | Licensed Material if doing so restricts exercise of the 197 | Licensed Rights by any recipient of the Licensed 198 | Material. 199 | 200 | 6. No endorsement. Nothing in this Public License constitutes or 201 | may be construed as permission to assert or imply that You 202 | are, or that Your use of the Licensed Material is, connected 203 | with, or sponsored, endorsed, or granted official status by, 204 | the Licensor or others designated to receive attribution as 205 | provided in Section 3(a)(1)(A)(i). 206 | 207 | b. Other rights. 208 | 209 | 1. Moral rights, such as the right of integrity, are not 210 | licensed under this Public License, nor are publicity, 211 | privacy, and/or other similar personality rights; however, to 212 | the extent possible, the Licensor waives and/or agrees not to 213 | assert any such rights held by the Licensor to the limited 214 | extent necessary to allow You to exercise the Licensed 215 | Rights, but not otherwise. 216 | 217 | 2. Patent and trademark rights are not licensed under this 218 | Public License. 219 | 220 | 3. To the extent possible, the Licensor waives any right to 221 | collect royalties from You for the exercise of the Licensed 222 | Rights, whether directly or through a collecting society 223 | under any voluntary or waivable statutory or compulsory 224 | licensing scheme. In all other cases the Licensor expressly 225 | reserves any right to collect such royalties, including when 226 | the Licensed Material is used other than for NonCommercial 227 | purposes. 228 | 229 | 230 | Section 3 -- License Conditions. 231 | 232 | Your exercise of the Licensed Rights is expressly made subject to the 233 | following conditions. 234 | 235 | a. Attribution. 236 | 237 | 1. 
If You Share the Licensed Material (including in modified 238 | form), You must: 239 | 240 | a. retain the following if it is supplied by the Licensor 241 | with the Licensed Material: 242 | 243 | i. identification of the creator(s) of the Licensed 244 | Material and any others designated to receive 245 | attribution, in any reasonable manner requested by 246 | the Licensor (including by pseudonym if 247 | designated); 248 | 249 | ii. a copyright notice; 250 | 251 | iii. a notice that refers to this Public License; 252 | 253 | iv. a notice that refers to the disclaimer of 254 | warranties; 255 | 256 | v. a URI or hyperlink to the Licensed Material to the 257 | extent reasonably practicable; 258 | 259 | b. indicate if You modified the Licensed Material and 260 | retain an indication of any previous modifications; and 261 | 262 | c. indicate the Licensed Material is licensed under this 263 | Public License, and include the text of, or the URI or 264 | hyperlink to, this Public License. 265 | 266 | 2. You may satisfy the conditions in Section 3(a)(1) in any 267 | reasonable manner based on the medium, means, and context in 268 | which You Share the Licensed Material. For example, it may be 269 | reasonable to satisfy the conditions by providing a URI or 270 | hyperlink to a resource that includes the required 271 | information. 272 | 273 | 3. If requested by the Licensor, You must remove any of the 274 | information required by Section 3(a)(1)(A) to the extent 275 | reasonably practicable. 276 | 277 | 4. If You Share Adapted Material You produce, the Adapter's 278 | License You apply must not prevent recipients of the Adapted 279 | Material from complying with this Public License. 280 | 281 | 282 | Section 4 -- Sui Generis Database Rights. 283 | 284 | Where the Licensed Rights include Sui Generis Database Rights that 285 | apply to Your use of the Licensed Material: 286 | 287 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 288 | to extract, reuse, reproduce, and Share all or a substantial 289 | portion of the contents of the database for NonCommercial purposes 290 | only; 291 | 292 | b. if You include all or a substantial portion of the database 293 | contents in a database in which You have Sui Generis Database 294 | Rights, then the database in which You have Sui Generis Database 295 | Rights (but not its individual contents) is Adapted Material; and 296 | 297 | c. You must comply with the conditions in Section 3(a) if You Share 298 | all or a substantial portion of the contents of the database. 299 | 300 | For the avoidance of doubt, this Section 4 supplements and does not 301 | replace Your obligations under this Public License where the Licensed 302 | Rights include other Copyright and Similar Rights. 303 | 304 | 305 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 306 | 307 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 308 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 309 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 310 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 311 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 312 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 313 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 314 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 315 | KNOWN OR DISCOVERABLE. 
WHERE DISCLAIMERS OF WARRANTIES ARE NOT 316 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 317 | 318 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 319 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 320 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 321 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 322 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 323 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 324 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 325 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 326 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 327 | 328 | c. The disclaimer of warranties and limitation of liability provided 329 | above shall be interpreted in a manner that, to the extent 330 | possible, most closely approximates an absolute disclaimer and 331 | waiver of all liability. 332 | 333 | 334 | Section 6 -- Term and Termination. 335 | 336 | a. This Public License applies for the term of the Copyright and 337 | Similar Rights licensed here. However, if You fail to comply with 338 | this Public License, then Your rights under this Public License 339 | terminate automatically. 340 | 341 | b. Where Your right to use the Licensed Material has terminated under 342 | Section 6(a), it reinstates: 343 | 344 | 1. automatically as of the date the violation is cured, provided 345 | it is cured within 30 days of Your discovery of the 346 | violation; or 347 | 348 | 2. upon express reinstatement by the Licensor. 349 | 350 | For the avoidance of doubt, this Section 6(b) does not affect any 351 | right the Licensor may have to seek remedies for Your violations 352 | of this Public License. 353 | 354 | c. For the avoidance of doubt, the Licensor may also offer the 355 | Licensed Material under separate terms or conditions or stop 356 | distributing the Licensed Material at any time; however, doing so 357 | will not terminate this Public License. 358 | 359 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 360 | License. 361 | 362 | 363 | Section 7 -- Other Terms and Conditions. 364 | 365 | a. The Licensor shall not be bound by any additional or different 366 | terms or conditions communicated by You unless expressly agreed. 367 | 368 | b. Any arrangements, understandings, or agreements regarding the 369 | Licensed Material not stated herein are separate from and 370 | independent of the terms and conditions of this Public License. 371 | 372 | 373 | Section 8 -- Interpretation. 374 | 375 | a. For the avoidance of doubt, this Public License does not, and 376 | shall not be interpreted to, reduce, limit, restrict, or impose 377 | conditions on any use of the Licensed Material that could lawfully 378 | be made without permission under this Public License. 379 | 380 | b. To the extent possible, if any provision of this Public License is 381 | deemed unenforceable, it shall be automatically reformed to the 382 | minimum extent necessary to make it enforceable. If the provision 383 | cannot be reformed, it shall be severed from this Public License 384 | without affecting the enforceability of the remaining terms and 385 | conditions. 386 | 387 | c. No term or condition of this Public License will be waived and no 388 | failure to comply consented to unless expressly agreed to by the 389 | Licensor. 390 | 391 | d. 
Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the “Licensor.” The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.

Creative Commons may be contacted at creativecommons.org.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# AccentDB
[A Database of Non-Native English Accents to Assist Neural Speech Recognition](https://accentdb.github.io/)

## Dataset
The current release **v1.0** of AccentDB has three datasets that can be downloaded from [here](https://accentdb.github.io/#dataset).

| Title | Description | Notes |
|:--------- | :---------- | --------: |
|**accentdb_core**| 4 non-native Indian English accents collected by the authors. | 6,587 files |
|**accentdb_extended**| Samples for 5 English accents + the 4 accents from accentdb_core. | 19,111 files|
|**accentdb_raw**| Raw and unprocessed recordings for the core dataset. | 11 files |

## Embedding Visualization
The 600 one-speaker-per-accent sample vectors and their metadata can be found at [AccentDB/embedding-150](https://github.com/AccentDB/embedding-150); the projection is at [Embedding Projector](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/AccentDB/embedding-150/master/template_projector_config.json).

Larger vector and metadata files can be downloaded from here:
- [accents-4-samples-250](https://drive.google.com/drive/folders/1ECGDOxcFAMp9y-yCBTy4d1M2Bb8fkp3r?usp=sharing): 1,000 rows.
- [accents-4-samples-700](https://drive.google.com/drive/folders/1d7pyl2AwmnEgVvGTeNjOYgKRa_awUnjN?usp=sharing): 2,800 rows.
- [accents-9-samples-250](https://drive.google.com/drive/folders/16vkVq36zTFGB2p0-QL7PZQRLGQkk9yQ7?usp=sharing): 22,500 rows.
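For a quick local sanity check of any of these downloads, a minimal sketch is below. It assumes the vectors and metadata are tab-separated files named `vectors.tsv` and `metadata.tsv` (names assumed for illustration; substitute the actual file names from the folder):

```python
# Sanity-check projector-style embedding files (file names are assumed).
import numpy as np
import pandas as pd

vectors = np.loadtxt("vectors.tsv", delimiter="\t")  # one embedding per row
metadata = pd.read_csv("metadata.tsv", sep="\t")     # one label row per embedding
print(vectors.shape, len(metadata))                  # row counts should match
```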
## Colab

Run the following Colab notebook to experiment with the classification model on a smaller AccentDB dataset:
[conv_classfication_multi_setup.ipynb](https://colab.research.google.com/drive/1J_pTtmY98vtWHWoIs9WhRuWK0b0uBuxU)

A static version of the notebook can be accessed [here](./notebooks/AccentDB_Classification_Colab.ipynb).

---------

## Code
The steps below are required only if you want to work with the raw recordings. We share the scripts that we used to clean and preprocess the recordings, as well as the code to train and test the different models.

`repo.tree` contains the structure of the repo, including the `.npy` and `.wav` files that are not tracked by git.

> We are thankful to https://github.com/dwww2012/Accent-Classifier for providing the code for preprocessing and generating MFCC vectors.

### Preprocessing .wav recordings

#### Step 1: Convert .mp3 files to .wav
Use the following script to convert all .mp3 files to .wav files.
```
for file in *.mp3
do
    ffmpeg -i "$file" "$file".wav
done
```
This produces .wav files with `.mp3.wav` names; the redundant `.mp3` part can then be removed with a bulk rename via `qmv` (from renameutils):
```
$ qmv -f do
```

#### Step 2: Split hour-long .wav recordings to sentence level

This is done using `split_to_wav.py` present in the corresponding folders, or the generic `helpers/alt_split.py`. The splitting is based on silence, detected with thresholds on energy and duration. The thresholds used for the experiments and the number of sentence-level files produced per recording are summarized below:

| Recording | Energy below | For longer than | Files produced |
| :--- | :---: | :---: | ---: |
| Bangla_Arc.wav | 1.0% | 2.0 s | 778 |
| Bangla_Jay.wav | 1.0% | 2.0 s | 750 |
| Malayalam_Hab.wav | 1.0% | 2.0 s | 751 |
| Malayalam_Sal.wav | 1.0% | 2.0 s | 28 |
| Malayalam_Sha.wav | 1.0% | 2.0 s | 895 |
| Odiya_Suc.wav | 1.0% | 2.0 s | 120 |
| Telugu_Nav.wav | 1.0% | 2.0 s | 766 |
| Telugu_Tho.wav | 1.0% | 2.0 s | 749 |

Two recordings split poorly at the 1.0% energy threshold and were re-split with a lower threshold:

| Recording | Energy below | For longer than | Files produced |
| :--- | :---: | :---: | ---: |
| Malayalam_Sal.wav | 0.1% | 2.0 s | 747 |
| Odiya_Suc.wav | 0.1% | 2.0 s | 747 |

Most of the resulting files are around 5 seconds long, while some run to 13 or 14 seconds.
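For illustration only, here is a minimal sketch of this kind of silence-based splitting using `pydub`. It is not the script used for the experiments (see `helpers/alt_split.py` for that); the 2.0 s minimum-silence length mirrors the table above, while the dB cutoff is an assumed stand-in for the percentage energy threshold:

```python
# Illustrative sketch only; the repo's own splitter is helpers/alt_split.py.
from pydub import AudioSegment
from pydub.silence import split_on_silence

audio = AudioSegment.from_wav("Bangla_Arc.wav")
chunks = split_on_silence(
    audio,
    min_silence_len=2000,            # silence longer than 2.0 s marks a boundary
    silence_thresh=audio.dBFS - 20,  # assumed dB stand-in for the 1.0% energy cutoff
    keep_silence=200,                # keep 0.2 s of padding around each sentence
)
for i, chunk in enumerate(chunks):
    chunk.export("sentence_{:04d}.wav".format(i), format="wav")
```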
#### Step 3: Trim all files to 5s
Run a script similar to `/data/all_accents/all_accents_trim.sh` to trim all files to less than 5 s. The trimming is done with `sox`; details here: https://stackoverflow.com/questions/9667081/how-do-you-trim-the-audio-files-end-using-sox
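A minimal sketch of such a trim loop is below (the actual `all_accents_trim.sh` may differ; `trim 0 5` keeps the first five seconds of each file):
```
for f in *.wav
do
    sox "$f" "trimmed_$f" trim 0 5
done
```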
#### Step 4: Generate X and Y vectors for training

└── `speech2vec`
    ├── `all_split.sh`: bash script to run all models on given X and Y npy vectors.
    ├── `gen_x.py`: generate MFCC vectors for all files in the specified folder.
    ├── `gen_y.py`: generate class labels for all files in the specified folder.
    └── `mfcc.py`: MFCC utility.

`.npy` files are stored in `/data/numpy_vectors`, or in the corresponding folder for some experiments.

--------
#### Step 5: Initial classification runs with MFCC features

Run the following Colab notebooks for the two experimental setups.
1. [conv1d on all_accents](https://colab.research.google.com/drive/1Z5vg1eRU3zCskrlTc2kp1y9xzUx8P9H8?authuser=2#scrollTo=Zz0tpQ_kiQNo) (requires an access request)
2. [train_on_one_person_and_test_on_other](https://colab.research.google.com/drive/1dMZxbFCPBc2gJkNM47F_j7lDtvVaDhxb?authuser=2#scrollTo=koL6wrhIq_em) (requires an access request)

The results can be found inside `data/numpy_vectors/terminal.log`.

Models run:
> ├── `classification`
│     ├── `attention_lstm.py`
│     ├── `attention_utils.py`
│     ├── `cnn_bilstm.py`
│     ├── `conv_1d_model_aws.py`
│     ├── `conv_1d_model.py`
│     ├── `conv_1d_model_run.py`

---------
## Citation
If you have found our dataset or models useful, please cite us as below.
```
@InProceedings{ahamad-anand-bhargava:2020:LREC,
  author    = {Ahamad, Afroz and Anand, Ankit and Bhargava, Pranesh},
  title     = {AccentDB: A Database of Non-Native English Accents to Assist Neural Speech Recognition},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
  month     = {May},
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  pages     = {5353--5360},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.659}
}
```
-----
## LICENSE
AccentDB is licensed under a [CC BY-NC 4.0 License](./LICENSE).

--------------------------------------------------------------------------------
/classification/attention_lstm.py:
--------------------------------------------------------------------------------

from __future__ import print_function
from keras.layers import multiply
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *

import numpy as np

from keras.optimizers import SGD


np.random.seed(1337)  # for reproducibility
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from attention_utils import get_activations, get_data_recurrent

nb_classes = 2
INPUT_DIM = 13
TIME_STEPS = 2999
# if True, the attention vector is shared across the input dimensions where the attention is applied.
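# SINGLE_ATTENTION_VECTOR=True averages the per-dimension attention maps into one
# attention-over-time vector shared across all input dimensions (the
# K.mean/RepeatVector branch in attention_3d_block below).
# APPLY_ATTENTION_BEFORE_LSTM selects which of the two model builders below is
# used: attention on the raw inputs before the LSTM, or on the LSTM outputs.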
26 | SINGLE_ATTENTION_VECTOR = False 27 | APPLY_ATTENTION_BEFORE_LSTM = True 28 | 29 | 30 | print('Loading data...') 31 | X = np.load('../data/numpy_vectors/x_label_splits.npy') 32 | y = np.load('../data/numpy_vectors/y_label_splits.npy') 33 | N = X.shape[0] 34 | X = X.repeat(2).repeat(2) 35 | y = y.repeat(2).repeat(2) 36 | X = X.reshape(4*N, 2999, 13) 37 | print(X.shape) 38 | print(y.shape) 39 | 40 | #X = X[:200] 41 | #y = y[:200] 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10) 43 | batch_size = 50 44 | 45 | Y_train = np_utils.to_categorical(y_train, nb_classes) 46 | Y_test = np_utils.to_categorical(y_test, nb_classes) 47 | #split__ = int((len(X_train)//batch_size)*batch_size) 48 | #X_train = X_train[:split__] 49 | #y_train = y_train[:split__] 50 | 51 | def attention_3d_block(inputs): 52 | # inputs.shape = (batch_size, time_steps, input_dim) 53 | input_dim = int(inputs.shape[2]) 54 | a = Permute((2, 1))(inputs) 55 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 56 | a = Dense(TIME_STEPS, activation='softmax')(a) 57 | if SINGLE_ATTENTION_VECTOR: 58 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 59 | a = RepeatVector(input_dim)(a) 60 | a_probs = Permute((2, 1), name='attention_vec')(a) 61 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul') 62 | return output_attention_mul 63 | 64 | 65 | def model_attention_applied_after_lstm(): 66 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,)) 67 | lstm_units = 100 68 | lstm_out = LSTM(lstm_units, return_sequences=True)(inputs) 69 | attention_mul = attention_3d_block(lstm_out) 70 | attention_mul = Flatten()(attention_mul) 71 | output = Dense(nb_classes, activation='sigmoid')(attention_mul) 72 | model = Model(input=[inputs], output=output) 73 | return model 74 | 75 | 76 | def model_attention_applied_before_lstm(): 77 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,)) 78 | attention_mul = attention_3d_block(inputs) 79 | lstm_units = 100 80 | attention_mul = LSTM(lstm_units, return_sequences=False)(attention_mul) 81 | output = Dense(nb_classes, activation='sigmoid')(attention_mul) 82 | model = Model(input=[inputs], output=output) 83 | return model 84 | 85 | 86 | if __name__ == '__main__': 87 | 88 | N = 300000 89 | # N = 300 -> too few = no training 90 | inputs_1, outputs = X_train, Y_train 91 | 92 | if APPLY_ATTENTION_BEFORE_LSTM: 93 | m = model_attention_applied_before_lstm() 94 | else: 95 | m = model_attention_applied_after_lstm() 96 | 97 | m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 98 | print(m.summary()) 99 | 100 | m.fit([inputs_1], outputs, epochs=6, batch_size=64, validation_split=0.1) 101 | 102 | y_pred=m.predict(X_test, batch_size=batch_size) 103 | for i in range(len(y_pred)): 104 | print(y_pred[i], Y_test[i]) 105 | # print(classification_report(Y_test, y_pred)) 106 | # attention_vectors = [] 107 | # for i in range(X_test.shape[0]): 108 | # testing_inputs_1, testing_outputs = X_test[i], y_test[i] 109 | # attention_vector = np.mean(get_activations(m, 110 | # testing_inputs_1, 111 | # print_shape_only=True, 112 | # layer_name='attention_vec')[0], axis=2).squeeze() 113 | # print('attention =', attention_vector) 114 | # assert (np.sum(attention_vector) - 1.0) < 1e-5 115 | # attention_vectors.append(attention_vector) 116 | 117 | # attention_vector_final = np.mean(np.array(attention_vectors), axis=0) 118 | # # plot part. 
# import matplotlib.pyplot as plt
# import pandas as pd

# pd.DataFrame(attention_vector_final, columns=['attention (%)']).plot(kind='bar',
#                                                                      title='Attention Mechanism as '
#                                                                            'a function of input'
#                                                                            ' dimensions.')
# plt.show()

--------------------------------------------------------------------------------
/classification/attention_utils.py:
--------------------------------------------------------------------------------

import keras.backend as K
import numpy as np


def get_activations(model, inputs, print_shape_only=False, layer_name=None):
    # Documentation is available online on GitHub at the address below.
    # From: https://github.com/philipperemy/keras-visualize-activations
    print('----- activations -----')
    activations = []
    inp = model.input
    if layer_name is None:
        outputs = [layer.output for layer in model.layers]
    else:
        outputs = [layer.output for layer in model.layers if layer.name == layer_name]  # all layer outputs
    funcs = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs]  # evaluation functions
    layer_outputs = [func([inputs, 1.])[0] for func in funcs]
    for layer_activations in layer_outputs:
        activations.append(layer_activations)
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations


def get_data(n, input_dim, attention_column=1):
    """
    Data generation. x is purely random except that the value at attention_column equals the target y.
    In practice, the network should learn that the target = x[attention_column].
    Therefore, most of its attention should be focused on the value addressed by attention_column.
    :param n: the number of samples to retrieve.
    :param input_dim: the number of dimensions of each element in the series.
    :param attention_column: the column linked to the target. Everything else is purely random.
    :return: x: model inputs, y: model targets
    """
    x = np.random.standard_normal(size=(n, input_dim))
    y = np.random.randint(low=0, high=2, size=(n, 1))
    x[:, attention_column] = y[:, 0]
    return x, y


def get_data_recurrent(n, time_steps, input_dim, attention_column=10):
    """
    Data generation. x is purely random except that the value at attention_column equals the target y.
    In practice, the network should learn that the target = x[attention_column].
    Therefore, most of its attention should be focused on the value addressed by attention_column.
    :param n: the number of samples to retrieve.
    :param time_steps: the number of time steps of your series.
    :param input_dim: the number of dimensions of each element in the series.
    :param attention_column: the column linked to the target. Everything else is purely random.
    :return: x: model inputs, y: model targets
    """
    x = np.random.standard_normal(size=(n, time_steps, input_dim))
    y = np.random.randint(low=0, high=2, size=(n, 1))
    x[:, attention_column, :] = np.tile(y[:], (1, input_dim))
    return x, y

--------------------------------------------------------------------------------
/classification/cnn_bilstm.py:
--------------------------------------------------------------------------------

from __future__ import print_function
import numpy as np

from keras.optimizers import SGD
from keras.callbacks import TensorBoard

np.random.seed(1337)  # for reproducibility
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Bidirectional
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.normalization import BatchNormalization

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# parameters
test_dim = 2999
maxlen = 100
batch_size = 100
nb_filter = 64
filter_length_1 = 50
filter_length_2 = 25
hidden_dims = 250
nb_epoch = 16
nb_classes = 2

print('Loading data...')
X = np.load('../data/numpy_vectors/x_label_splits.npy')
y = np.load('../data/numpy_vectors/y_label_splits.npy')
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

# create the model
embedding_vector_length = 32
model = Sequential()

# model.add(Conv1D(filters=32, input_shape=(test_dim, 13), kernel_size=3, padding='same', activation='relu'))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length_1,
                        input_shape=(test_dim, 13),
                        border_mode='valid',
                        activation='relu'))

model.add(BatchNormalization())

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length_2,
                        border_mode='same',
                        activation='relu'))

model.add(BatchNormalization())

model.add(MaxPooling1D(pool_length=2))

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length_2,
                        border_mode='same',
                        activation='relu'))

model.add(BatchNormalization())

model.add(MaxPooling1D(pool_length=2))

# No Flatten here: the bidirectional LSTM stack consumes the conv output sequence.
# model.add(Dense(hidden_dims))
model.add(Dropout(0.25))

model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Bidirectional(LSTM(100)))

model.add(Dropout(0.25))

model.add(Dense(128, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(nb_classes, activation='softmax'))
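# Labels were one-hot encoded with np_utils.to_categorical above, so categorical
# cross-entropy is the matching loss for the nb_classes-way softmax output.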
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

model.fit(X_train, Y_train, batch_size=batch_size,
          nb_epoch=nb_epoch, verbose=1, validation_split=0.15)

y_preds = model.predict(X_test)
for i in range(len(y_preds)):
    print(y_preds[i], y_test[i])

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

--------------------------------------------------------------------------------
/classification/conv_1d_model.py:
--------------------------------------------------------------------------------

from __future__ import print_function
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from time import time
# np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.utils import np_utils
from keras.callbacks import TensorBoard


# set parameters:
test_dim = 2999
maxlen = 100
nb_filter = 64
filter_length_1 = 50
filter_length_2 = 25
hidden_dims = 250
nb_epoch = 20
nb_classes = 2
split_ratio = 0.15

print('Loading data...')

X = np.load('../data/numpy_vectors/x_label_splits.npy')
y = np.load('../data/numpy_vectors/y_label_splits.npy')
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)

xts = X_train.shape
# X_train = np.reshape(X_train, (xts[0], xts[1], 1))
xtss = X_test.shape
# X_test = np.reshape(X_test, (xtss[0], xtss[1], 1))
yts = y_train.shape
# y_train = np.reshape(y_train, (yts[0], 1))
ytss = y_test.shape
# y_test = np.reshape(y_test, (ytss[0], 1))

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

# print('Pad sequences (samples x time)')
# X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
# print('X_train shape:', X_train.shape)
# print('X_test shape:', X_test.shape)

for batch_size in range(10, 11, 5):
    print('Build model...')
    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    # model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    # model.add(Dropout(0.25))

    # we add a Convolution1D, which will learn nb_filter
    # filters of size filter_length_1 over the MFCC time series:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length_1,
                            input_shape=(test_dim, 13),
                            border_mode='valid',
                            activation='relu'))
    # normalize the conv activations (max pooling comes after the next conv block):
    model.add(BatchNormalization())

    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length_2,
                            border_mode='same',
                            activation='relu'))
85 | model.add(BatchNormalization()) 86 | 87 | model.add(MaxPooling1D(pool_length=2)) 88 | 89 | model.add(Convolution1D(nb_filter=nb_filter, 90 | filter_length=filter_length_2, 91 | border_mode='same', 92 | activation='relu' 93 | )) 94 | 95 | model.add(BatchNormalization()) 96 | 97 | model.add(MaxPooling1D(pool_length=2)) 98 | 99 | # We flatten the output of the conv layer, 100 | # so that we can add a vanilla dense layer: 101 | model.add(Flatten()) 102 | 103 | # We add a vanilla hidden layer: 104 | # model.add(Dense(hidden_dims)) 105 | model.add(Dropout(0.25)) 106 | # model.add(Activation('relu')) 107 | 108 | # We project onto an nb_classes-unit output layer, and squash it with a softmax: 109 | model.add(Dense(nb_classes)) 110 | model.add(Activation('softmax')) 111 | 112 | model.compile(loss='categorical_crossentropy', 113 | optimizer='adam', metrics=['accuracy']) 114 | 115 | print("model/split = {} <> batchsize = {}".format(split_ratio, batch_size)) 116 | tensorboard = TensorBoard(log_dir="logs/split_{}_batchsize_{}".format(split_ratio, batch_size)) 117 | 118 | model.fit(X_train, Y_train, batch_size=batch_size, 119 | nb_epoch=nb_epoch, verbose=1, callbacks=[tensorboard] ) 120 | 121 | # model.save('model_hin_tel_38_samples.h5') 122 | 123 | y_preds = model.predict(X_test) 124 | for i in range(len(y_preds)): 125 | print(y_preds[i], y_test[i]) 126 | 127 | score = model.evaluate(X_test, Y_test, verbose=1) 128 | print(score) 129 | print("\n**********************************\n") 130 | 131 | # print(classification_report(Y_test, Y_preds)) 132 |
-------------------------------------------------------------------------------- /classification/conv_1d_model_aws.py: --------------------------------------------------------------------------------
1 | 2 | from __future__ import print_function 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import classification_report 6 | np.random.seed(1337) # for reproducibility 7 | 8 | from keras.preprocessing import sequence 9 | from keras.layers.noise import GaussianNoise 10 | from keras.models import Sequential 11 | from keras.layers.core import Dense, Dropout, Activation, Flatten 12 | from keras.layers.normalization import BatchNormalization 13 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, AveragePooling1D 14 | from keras.utils import np_utils 15 | 16 | 17 | # set parameters: 18 | test_dim = 999 19 | maxlen = 100 20 | batch_size = 50 21 | nb_filter = 512 22 | filter_length_1 = 100 23 | filter_length_2 = 30 24 | filter_length_3 = 15 25 | hidden_dims = 10 26 | nb_epoch = 5 27 | nb_classes = 3 28 | 29 | print('Loading data...') 30 | X = np.load('top_3_100_split_mfcc.npy') 31 | y = np.load('top_3_100_split_y.npy') 32 | 33 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15) 34 | 35 | # in case the passed in data is 2d and not 3d 36 | ''' 37 | xts = X_train.shape 38 | X_train = np.reshape(X_train, (xts[0], xts[1], 1)) 39 | xtss = X_test.shape 40 | X_test = np.reshape(X_test, (xtss[0], xtss[1], 1)) 41 | yts = y_train.shape 42 | y_train = np.reshape(y_train, (yts[0], 1)) 43 | ytss = y_test.shape 44 | y_test = np.reshape(y_test, (ytss[0], 1)) 45 | ''' 46 | 47 | print(len(X_train), 'train sequences') 48 | print(len(X_test), 'test sequences') 49 | 50 | Y_train = np_utils.to_categorical(y_train, nb_classes) 51 | Y_test = np_utils.to_categorical(y_test, nb_classes) 52 | 53 | 54 | print('Build model...') 55 | model = Sequential() 56 |
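# (Sanity check, not part of the original script: the first Conv1D below declares
# input_shape=(test_dim, 13), so the loaded arrays are expected to arrive as
# (n_samples, 999, 13) -- 999 MFCC frames of 13 coefficients per clip.)
assert X_train.shape[1:] == (test_dim, 13), X_train.shape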
57 | # we add a Convolution1D, which will learn nb_filter mfcc groups: 58 | model.add(Convolution1D(nb_filter=nb_filter, 59 | filter_length=filter_length_1, 60 | input_shape=(test_dim, 13), 61 | init = 'glorot_normal', 62 | border_mode='valid', 63 | activation='relu' 64 | )) 65 | 66 | # batch normalization to keep the layer activations well-scaled during training 67 | model.add(BatchNormalization()) 68 | 69 | # add more layers 70 | model.add(Convolution1D(nb_filter=nb_filter, 71 | filter_length=filter_length_2, 72 | border_mode='valid', 73 | activation='relu' 74 | )) 75 | 76 | model.add(BatchNormalization()) 77 | 78 | # we use standard max pooling (halving the output of the previous layer) 79 | model.add(MaxPooling1D(pool_length=2)) 80 | 81 | 82 | model.add(Convolution1D(nb_filter=nb_filter, 83 | filter_length=filter_length_2, 84 | border_mode='valid', 85 | activation='relu' 86 | )) 87 | 88 | model.add(BatchNormalization()) 89 | 90 | model.add(MaxPooling1D(pool_length=2)) 91 | 92 | model.add(Convolution1D(nb_filter=nb_filter, 93 | filter_length=filter_length_2, 94 | border_mode='valid', 95 | activation='relu' 96 | )) 97 | 98 | model.add(BatchNormalization()) 99 | 100 | model.add(MaxPooling1D(pool_length=2)) 101 | 102 | # Dropout reduces overfitting 103 | model.add(Dropout(.1)) 104 | 105 | model.add(Convolution1D(nb_filter=nb_filter, 106 | filter_length=filter_length_2, 107 | border_mode='valid', 108 | activation='relu' 109 | )) 110 | 111 | model.add(BatchNormalization()) 112 | 113 | model.add(MaxPooling1D(pool_length=2)) 114 | 115 | model.add(Dropout(.1)) 116 | 117 | model.add(Convolution1D(nb_filter=nb_filter, 118 | filter_length=filter_length_3, 119 | border_mode='valid', 120 | activation='relu' 121 | )) 122 | 123 | model.add(BatchNormalization()) 124 | 125 | model.add(MaxPooling1D(pool_length=2)) 126 | 127 | # We flatten the output of the conv layer, 128 | # so that we can add a vanilla dense layer: 129 | model.add(Flatten()) 130 | 131 | # We project onto an nb_classes-unit output layer, and squash it with a softmax into 0-1 probability space: 132 | model.add(Dense(nb_classes)) 133 | model.add(Activation('softmax')) 134 | 135 | model.compile(loss='categorical_crossentropy', 136 | optimizer='adam', metrics = ["accuracy"]) 137 | model.fit(X_train, Y_train, batch_size=batch_size, 138 | nb_epoch=nb_epoch, verbose=1, 139 | validation_data=(X_test, Y_test)) 140 | 141 | # print report of recall, precision, f1 score 142 | y_pred = model.predict_classes(X_test) 143 | print(classification_report(y_test, y_pred)) 144 |
-------------------------------------------------------------------------------- /classification/conv_1d_model_run.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | from gen_y import generate_y 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import classification_report 5 | from keras.models import load_model 6 | from keras.utils import np_utils 7 | 8 | nb_classes = 3 9 | 10 | X_test = np.load('test_mfcc_merge_spanish_test.npy') 11 | #print(X.shape) 12 | # y = generate_y('/media/enigmaeth/My Passport/Datasets/Accent/sounds_wav') 13 | #print(y.shape) 14 | 15 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15) 16 | 17 | # Y_train = np_utils.to_categorical(y_train, nb_classes) 18 | # Y_test = np_utils.to_categorical(y_test, nb_classes) 19 | 20 | #X_test = np.load('test_mfcc.npy') 21 | print(X_test.shape) 22 | y = np.array([2]*380) 23 | #print(y) 24 | Y_test = np_utils.to_categorical(y, nb_classes) 25 | print(Y_test.shape) 26 |
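# (Aside, not part of the original script: np_utils.to_categorical turns class
# indices into one-hot rows, e.g. index 2 with nb_classes=3 -> [0., 0., 1.],
# so Y_test above has shape (380, 3).)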
27 | model = load_model('model_20epochs.h5') 28 | prediction = model.predict(X_test) 29 | 30 | # pick the class with the highest softmax score for each test clip 31 | ans = np.argmax(prediction, axis=1).tolist() 32 | 33 | print(ans, len(ans)) 34 | 35 | score = model.evaluate(X_test, Y_test, verbose=1) 36 | print(score)
-------------------------------------------------------------------------------- /classification/log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/log.txt -------------------------------------------------------------------------------- /classification/logs/split_0.15_batchsize_10/events.out.tfevents.1569179240.elem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/logs/split_0.15_batchsize_10/events.out.tfevents.1569179240.elem -------------------------------------------------------------------------------- /classification/logs/split_0.15_batchsize_10/events.out.tfevents.1569217478.elem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/logs/split_0.15_batchsize_10/events.out.tfevents.1569217478.elem -------------------------------------------------------------------------------- /classification/lstm_cnn.py: --------------------------------------------------------------------------------
1 | '''Train a recurrent convolutional network on the IMDB sentiment classification task. 2 | Gets to 0.8498 test accuracy after 2 epochs. 41s/epoch on K520 GPU. 3 | ''' 4 | from __future__ import print_function 5 | 6 | from keras.preprocessing import sequence 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout, Activation 9 | from keras.layers import Embedding 10 | from keras.layers import LSTM 11 | from keras.layers import Conv1D, MaxPooling1D 12 | from keras.datasets import imdb 13 | 14 | # Embedding 15 | max_features = 20000 16 | maxlen = 100 17 | embedding_size = 128 18 | 19 | # Convolution 20 | kernel_size = 5 21 | filters = 64 22 | pool_size = 4 23 | 24 | # LSTM 25 | lstm_output_size = 70 26 | 27 | # Training 28 | batch_size = 30 29 | epochs = 2 30 | 31 | ''' 32 | Note: 33 | batch_size is highly sensitive. 34 | Only 2 epochs are needed as the dataset is very small. 35 | ''' 36 |
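# (Aside, not part of the original script: sequence.pad_sequences pads and
# truncates on the left by default, e.g.
# pad_sequences([[1, 2, 3], [4]], maxlen=2) -> [[2, 3], [0, 4]].)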
37 | print('Loading data...') 38 | (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) 39 | print(len(x_train), 'train sequences') 40 | print(len(x_test), 'test sequences') 41 | 42 | print('Pad sequences (samples x time)') 43 | x_train = sequence.pad_sequences(x_train, maxlen=maxlen) 44 | x_test = sequence.pad_sequences(x_test, maxlen=maxlen) 45 | print('x_train shape:', x_train.shape) 46 | print('x_test shape:', x_test.shape) 47 | 48 | print('Build model...') 49 | 50 | model = Sequential() 51 | model.add(Embedding(max_features, embedding_size, input_length=maxlen)) 52 | model.add(Dropout(0.25)) 53 | model.add(Conv1D(filters, 54 | kernel_size, 55 | padding='valid', 56 | activation='relu', 57 | strides=1)) 58 | model.add(MaxPooling1D(pool_size=pool_size)) 59 | model.add(LSTM(lstm_output_size)) 60 | model.add(Dense(1)) 61 | model.add(Activation('sigmoid')) 62 | 63 | model.compile(loss='binary_crossentropy', 64 | optimizer='adam', 65 | metrics=['accuracy']) 66 | 67 | print('Train...') 68 | model.fit(x_train, y_train, 69 | batch_size=batch_size, 70 | epochs=epochs, 71 | validation_data=(x_test, y_test)) 72 | score, acc = model.evaluate(x_test, y_test, batch_size=batch_size) 73 | print('Test score:', score) 74 | print('Test accuracy:', acc) 75 |
-------------------------------------------------------------------------------- /classification/model_5epochs_rnn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/model_5epochs_rnn.h5 -------------------------------------------------------------------------------- /classification/model_hin_tel_38_samples.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/model_hin_tel_38_samples.h5 -------------------------------------------------------------------------------- /classification/rnn_example.py: --------------------------------------------------------------------------------
1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | # from .attention_lstm import goo  # unused, and not importable when this file is run as a script 5 | from keras.optimizers import SGD 6 | 7 | 8 | np.random.seed(1337) # for reproducibility 9 | from keras.preprocessing import sequence 10 | from keras.utils import np_utils 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Dropout, Activation 13 | from keras.layers.recurrent import LSTM 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.metrics import classification_report 16 | from gen_y import generate_y 17 | 18 | hidden_units = 100 19 | nb_classes = 2 20 | print('Loading data...') 21 | X = np.load('x_test_mfcc_split_wav_30sec.npy') 22 | y = generate_y('/media/enigmaeth/My Passport/Datasets/linguistics data/split_wav_30sec') 23 | 24 | X = X[:200] 25 | y = y[:200] 26 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) 27 | batch_size = 20 28 | # trim the training set to a whole number of batches 29 | split__ = int((len(X_train)//batch_size)*batch_size) 30 | X_train = X_train[:split__] 31 | y_train = y_train[:split__] 32 | 33 | 34 | print(len(X_train), 'train sequences') 35 | print(len(X_test), 'test sequences') 36 | print('X_train shape:', X_train.shape) 37 | print('X_test shape:', X_test.shape) 38 | print('y_train shape:', y_train.shape) 39 | print('y_test shape:', y_test.shape) 40 | print('Build model...') 41 | 42 | print(y_train)
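# (Aside, not part of the original script: the first LSTM below is built with
# batch_input_shape=(batch_size, timesteps, features), e.g. (20, 2999, 13) if the
# clips are 2999-frame MFCC matrices -- an illustrative shape, since the .npy
# contents are not shown here. A fixed batch dimension is only required when
# stateful=True; with stateful=False a plain input_shape would also work.)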
43 | Y_train = np_utils.to_categorical(y_train, nb_classes) 44 | Y_test = np_utils.to_categorical(y_test, nb_classes) 45 | 46 | model = Sequential() 47 | 48 | #batch_input_shape= (batch_size, X_train.shape[1], X_train.shape[2]) 49 | 50 | # note that it is necessary to pass in 3d batch_input_shape if stateful=True 51 | model.add(LSTM(64, return_sequences=True, stateful=False, 52 | batch_input_shape= (batch_size, X_train.shape[1], X_train.shape[2]))) 53 | model.add(LSTM(64, return_sequences=True, stateful=False)) 54 | model.add(LSTM(64, stateful=False)) 55 | 56 | 57 | # add dropout to control for overfitting 58 | model.add(Dropout(.25)) 59 | 60 | # squash output onto number of classes in probability space 61 | model.add(Dense(nb_classes, activation='softmax')) 62 | 63 | 64 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"]) 65 | 66 | print("Train...") 67 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=5, validation_data=(X_test, Y_test)) 68 | 69 | y_pred=model.predict_classes(X_test, batch_size=batch_size) 70 | print(classification_report(y_test, y_pred)) 71 | 72 | model.save('model_5epochs_rnn.h5') 73 |
-------------------------------------------------------------------------------- /classification/testing.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import pandas as pd 3 | from python_speech_features import mfcc 4 | from python_speech_features import logfbank 5 | import scipy.io.wavfile as wav 6 | from scipy.io.wavfile import write as wav_write 7 | import librosa 8 | import scipy 9 | from tqdm import tqdm 10 | import shutil # used by copy_files_from_csv below 11 | import os 12 | 13 | 14 | ''' 15 | mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True) 16 | ''' 17 | # read in wav file, get out signal (np array) and sampling rate (int) 18 | def read_in_audio(filename): 19 | (rate, sig) = wav.read(filename) 20 | return sig, rate 21 | 22 | 23 | # read in signal, take absolute value and slice seconds 1-3 from beginning 24 | def get_two_secs(filename): 25 | sig, rate = read_in_audio(filename) 26 | abs_sig = np.abs(sig) 27 | two_secs = abs_sig[rate:3*rate] 28 | return two_secs 29 | 30 | # calculates moving average for a specified window (number of samples) 31 | def take_moving_average(sig, window_width): 32 | cumsum_vec = np.cumsum(np.insert(sig, 0, 0)) 33 | ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width])/float(window_width) 34 | return ma_vec 35 | 36 | # read in signal, change sample rate to outrate (samples/sec), use write_wav=True to save wav file to disk 37 | def downsample(filename, outrate=8000, write_wav = False): 38 | print(filename) 39 | (rate, sig) = wav.read(filename) 40 | down_sig = librosa.core.resample(sig * 1., rate, outrate, scale=True) 41 | if not write_wav: 42 | return down_sig, outrate 43 | if write_wav: 44 | wav_write('{}_down_{}.wav'.format(filename, outrate), outrate, down_sig) 45 | 46 | def librosa_downsample(filename, outrate=8000): 47 | y, s = librosa.load(filename, sr=outrate) 48 | return y, s 49 | 50 | def custom_downsample(filename, outrate=8000): 51 | (rate, sig) = wav.read(filename) 52 | len_in_samps = len(sig) 53 | secs = len_in_samps/rate # number of seconds in the signal 54 | samps = secs*outrate # number of samples after downsampling 55 | print(secs, samps) 56 | Y = scipy.signal.resample(sig , int(samps)) 57 | return Y, outrate 58 |
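# (Usage sketch, not part of the original file; 'sample.wav' is a made-up name:
#   sig = make_standard_length('sample.wav')   # 240000 samples = 30 s at 8 kHz
#   feats = make_normed_mfcc('sample.wav')     # (13, 2999): 13 coefficients x 2999 frames,
# which make_class_array then stacks and transposes into the (n, 2999, 13)
# arrays the Conv1D models above expect.)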
59 | # change total number of samps for downsampled file to n_samps by trimming or zero-padding and standardize them 60 | def make_standard_length(filename, n_samps=240000): 61 | down_sig, rate = librosa_downsample(filename) 62 | normed_sig = librosa.util.fix_length(down_sig, n_samps) 63 | normed_sig = (normed_sig - np.mean(normed_sig))/np.std(normed_sig) 64 | return normed_sig 65 | 66 | # from a folder containing wav files, normalize each, divide into num_splits-1 chunks and write the resulting np.arrays to a single matrix 67 | def make_split_audio_array(folder, num_splits = 5): 68 | """ 69 | returns numpy array of split audio for a folder 70 | """ 71 | lst = [] 72 | for filename in tqdm(os.listdir(folder)) : 73 | if filename.endswith('wav'): 74 | normed_sig = make_standard_length(os.path.join(folder, filename)) 75 | chunk = normed_sig.shape[0]//num_splits # integer chunk size so the slices below stay valid indices 76 | for i in range(num_splits - 1): 77 | lst.append(normed_sig[i*chunk:(i+2)*chunk]) 78 | lst = np.array(lst) 79 | lst = lst.reshape(lst.shape[0], -1) 80 | return lst 81 | 82 | # for input wav file outputs (13, 2999) mfcc np array 83 | def make_normed_mfcc(filename, outrate=8000): 84 | normed_sig = make_standard_length(filename) 85 | normed_mfcc_feat = mfcc(normed_sig, outrate) 86 | normed_mfcc_feat = normed_mfcc_feat.T 87 | return normed_mfcc_feat 88 | 89 | # make mfcc np array from wav file using librosa package 90 | def make_librosa_mfcc(filename): 91 | y, sr = librosa.load(filename) 92 | mfcc_feat = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) 93 | return mfcc_feat 94 | 95 | # make mfcc np array from wav file using speech features package 96 | def make_mfcc(filename): 97 | (rate, sig) = wav.read(filename) 98 | mfcc_feat = mfcc(sig, rate) 99 | mfcc_feat = mfcc_feat.T 100 | return mfcc_feat 101 | 102 | # for folder containing wav files, output numpy array of normed mfcc 103 | def make_class_array(folder): 104 | lst = [] 105 | files = os.listdir(folder) 106 | count_files = len(files) 107 | for idx, file_path in tqdm(enumerate(files)): 108 | filename = os.path.join(folder, file_path) 109 | lst.append(make_normed_mfcc(filename)) 110 | class_array = np.array(lst) 111 | class_array = np.reshape(class_array, (class_array.shape[0], class_array.shape[2], class_array.shape[1])) 112 | return class_array 113 | 114 | # read in wav file, output (1,13) numpy array of mean mfccs for each of 13 features 115 | def make_mean_mfcc(filename): 116 | try: 117 | (rate, sig) = wav.read(filename) 118 | mfcc_feat = mfcc(sig, rate) 119 | avg_mfcc = np.mean(mfcc_feat, axis = 0) 120 | return avg_mfcc 121 | except: # unreadable wav: skip and return None 122 | pass 123 | 124 | # write new csv corresponding to dataframe of given language and gender 125 | def make_df_language_gender(df, language, gender): 126 | newdf = df.query("native_language == @language").query("sex == @gender") 127 | newdf.to_csv('df_{}_{}.csv'.format(language, gender)) 128 | 129 | # write new directories to disk containing the male and female speakers from the most common languages 130 | def make_folders_from_csv(): 131 | top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese'] 132 | for lang in top_15_langs: 133 | os.makedirs('{}/{}_male'.format(lang, lang)) 134 | os.makedirs('{}/{}_female'.format(lang, lang)) 135 | 136 | # copy files to the corresponding directories 137 | def copy_files_from_csv(): 138 | top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish',
'japanese', 'vietnamese'] 139 | for lang in top_15_langs: 140 | df_male = pd.read_csv('df_{}_male.csv'.format(lang)) 141 | df_female = pd.read_csv('df_{}_female.csv'.format(lang)) 142 | m_list = df_male['filename'].values 143 | f_list = df_female['filename'].values 144 | for filename in f_list: 145 | shutil.copy2('big_langs/{}/{}.wav'.format(lang, filename), 'big_langs/{}/{}_female/{}.wav'.format(lang, lang, filename)) 146 | 147 | # input folder of wav files, output pandas dataframe of mean mfcc values 148 | def make_mean_mfcc_df(folder): 149 | norms = [] 150 | for file_path in os.listdir(folder): 151 | filename = os.path.join(folder, file_path) 152 | (rate, sig) = wav.read(filename) 153 | mfcc_feat = mfcc(sig, rate) 154 | mean_mfcc = np.mean(mfcc_feat, axis = 0) 155 | #mean_mfcc = np.reshape(mean_mfcc, (1,13)) 156 | norms.append(mean_mfcc) 157 | flat = [a.ravel() for a in norms] 158 | stacked = np.vstack(flat) 159 | df = pd.DataFrame(stacked) 160 | return df 161 | -------------------------------------------------------------------------------- /classification/testing.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/classification/testing.pyc -------------------------------------------------------------------------------- /data/all_accents/accent_trim_gen_x.py: -------------------------------------------------------------------------------- 1 | from mfcc import * 2 | import numpy as np 3 | 4 | folder = '/home/enigmaeth/accentPhase2/data/all_accents/all_accents_trim' 5 | 6 | x = make_class_array(folder) 7 | print(x.shape) 8 | X_file = '/home/enigmaeth/accentPhase2/data/numpy_vectors/x_' + (folder.split('/'))[-1] 9 | 10 | print("saving labels to ", X_file) 11 | np.save(X_file, x) 12 | 13 | 14 | 15 | # filename = "english1.wav" 16 | 17 | # with open(filename, 'rb') as f: 18 | # print(read_in_audio(f)) 19 | 20 | # cd = make_class_array('/media/enigmaeth/My Passport/Datasets/Accent/clean_data') 21 | # print(cd.shape) 22 | # np.save('top_3_100_split_mfcc.npy', cd) 23 | # mf = make_mean_mfcc_df('/media/enigmaeth/My Passport/Datasets/Accent/sounds_wav') 24 | # print(mf.shape) 25 | # np.save('top_3_100_split_y.npy', mf) 26 | -------------------------------------------------------------------------------- /data/all_accents/accent_trim_gen_y.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | def generate_y(folder): 5 | accents = {} 6 | counts = {} 7 | y = [] 8 | index = 0 9 | 10 | for filename in os.listdir(folder): 11 | name = ''.join([i for i in filename if not i.isdigit()]) 12 | name = name.split('_')[0] 13 | if name not in accents: 14 | accents[name] = index 15 | index += 1 16 | counts[name] = 0 17 | 18 | counts[name] += 1 19 | y.append(accents[name]) 20 | 21 | print(counts) 22 | print(accents) 23 | 24 | sorted_counts = sorted(counts, key=counts.get, reverse=True) 25 | for r in sorted_counts: 26 | print(r, counts[r]) 27 | 28 | np_y = np.reshape(np.array(y), (len(y), 1)) 29 | 30 | Y_file = '/home/enigmaeth/accentPhase2/data/numpy_vectors/y_'+ (folder.split('/'))[-1] 31 | print("saving labels to ", Y_file) 32 | np.save(Y_file, y) 33 | 34 | folder = "/home/enigmaeth/accentPhase2/data/all_accents/all_accents_trim" 35 | generate_y(folder) -------------------------------------------------------------------------------- /data/all_accents/all_accents_split.log.save: 
-------------------------------------------------------------------------------- 1 | Splitting Bangla_Arc.wav where energy is below 1.0% for longer than 2.0s. Splitting Bangla_Jay.wav where energy is below 1.0% for longer than 2.0s. Splitting Malayalam_Hab.wav where energy is below 1.0% for longer than 2.0s. Splitting Malayalam_Sal.wav where energy is below 1.0% for longer than 2.0s. Splitting Malayalam_Sha.wav where energy is below 1.0% for longer than 2.0s. Splitting Odiya_Suc.wav where energy is below 1.0% for longer than 2.0s. Splitting Telugu_Nav.wav where energy is below 1.0% for longer than 2.0s.
Splitting Telugu_Tho.wav where energy is below 1.0% for longer than 2.0s.
-------------------------------------------------------------------------------- /data/all_accents/all_accents_trim.sh: --------------------------------------------------------------------------------
1 | cd all_accents_out 2 | for i in *.wav 3 | do 4 | sox "$i" ../all_accents_trim/"$i" trim 0 300 5 | done 6 |
-------------------------------------------------------------------------------- /data/all_accents/alt_split.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | 3 | from scipy.io import wavfile 4 | import os 5 | import numpy as np 6 | import argparse 7 | from tqdm import tqdm 8 | 9 | # Utility functions 10 | xrange = range 11 | def windows(signal, window_size, step_size): 12 | if type(window_size) is not int: 13 | raise AttributeError("Window size must be an integer.") 14 | if type(step_size) is not int: 15 | raise AttributeError("Step size must be an integer.") 16 | for i_start in xrange(0, len(signal), step_size): 17 | i_end = i_start + window_size 18 | if i_end >= len(signal): 19 | break 20 | yield signal[i_start:i_end] 21 | 22 | def energy(samples): 23 | return np.sum(np.power(samples, 2.)) / float(len(samples)) 24 | 25 | def rising_edges(binary_signal): 26 | previous_value = 0 27 | index = 0 28 | for x in binary_signal: 29 | if x and not previous_value: 30 | yield index 31 | previous_value = x 32 | index += 1 33 | 34 | # Process command line arguments 35 | 36 | parser = argparse.ArgumentParser(description='Split a WAV file at silence.') 37 | parser.add_argument('input_file', type=str, help='The WAV file to split.') 38 | parser.add_argument('--output-dir', '-o', type=str, default='.', help='The output folder. Defaults to the current folder.') 39 | parser.add_argument('--min-silence-length', '-m', type=float, default=3., help='The minimum length of silence at which a split may occur [seconds]. Defaults to 3 seconds.') 40 | parser.add_argument('--silence-threshold', '-t', type=float, default=1e-6, help='The energy level (between 0.0 and 1.0) below which the signal is regarded as silent. Defaults to 1e-6 == 0.0001%.') 41 | parser.add_argument('--step-duration', '-s', type=float, default=None, help='The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Defaults to (min-silence-length / 10.).')
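# (Worked example, not part of the original script: split_to_wav.sh below calls
# this with --min-silence-length=2 and --silence-threshold=0.01, so for a
# 44.1 kHz recording -- an assumed rate, for illustration --
# window_size = int(2 * 44100) = 88200 samples and, with the default step of
# 2 / 10 = 0.2 s, step_size = int(0.2 * 44100) = 8820 samples.)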
42 | parser.add_argument('--dry-run', '-n', action='store_true', help='Don\'t actually write any output files.') 43 | 44 | args = parser.parse_args() 45 | 46 | input_filename = args.input_file 47 | window_duration = args.min_silence_length 48 | if args.step_duration is None: 49 | step_duration = window_duration / 10. 50 | else: 51 | step_duration = args.step_duration 52 | silence_threshold = args.silence_threshold 53 | output_dir = args.output_dir 54 | output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0] 55 | dry_run = args.dry_run 56 | 57 | print(f"Splitting {input_filename} where energy is below {silence_threshold * 100}% for longer than {window_duration}s.") 58 | 59 | sample_rate, samples = wavfile.read(filename=input_filename, mmap=True) 60 | 61 | max_amplitude = np.iinfo(samples.dtype).max 62 | max_energy = energy([max_amplitude]) 63 | 64 | window_size = int(window_duration * sample_rate) 65 | step_size = int(step_duration * sample_rate) 66 | 67 | signal_windows = windows( 68 | signal=samples, 69 | window_size=window_size, 70 | step_size=step_size 71 | ) 72 | 73 | window_energy = (energy(w) / max_energy for w in tqdm( 74 | signal_windows, 75 | total=int(len(samples) / float(step_size)) 76 | )) 77 | 78 | window_silence = (e > silence_threshold for e in window_energy) # True where the window is above the threshold, i.e. voiced 79 | 80 | cut_times = (r * step_duration for r in rising_edges(window_silence)) 81 | 82 | # This is the step that takes long, since we force the generators to run. 83 | cut_samples = [int(t * sample_rate) for t in cut_times] 84 | cut_samples.append(-1) 85 | 86 | cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in xrange(len(cut_samples) - 1)] 87 | 88 | for i, start, stop in tqdm(cut_ranges): 89 | output_file_path = "{}_{:03d}.wav".format( 90 | os.path.join(output_dir, output_filename_prefix), 91 | i 92 | ) 93 | if not dry_run: 94 | wavfile.write( 95 | filename=output_file_path, 96 | rate=sample_rate, 97 | data=samples[start:stop] 98 | ) 99 |
-------------------------------------------------------------------------------- /data/all_accents/mfcc.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import pandas as pd 3 | from python_speech_features import mfcc 4 | from python_speech_features import logfbank 5 | import scipy.io.wavfile as wav 6 | from scipy.io.wavfile import write as wav_write 7 | import librosa 8 | import scipy 9 | from tqdm import tqdm 10 | import shutil # used by copy_files_from_csv below 11 | import os 12 | 13 | 14 | ''' 15 | mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True) 16 | ''' 17 | # read in wav file, get out signal (np array) and sampling rate (int) 18 | def read_in_audio(filename): 19 | (rate, sig) = wav.read(filename) 20 | return sig, rate 21 | 22 | 23 | # read in signal, take absolute value and slice seconds 1-3 from beginning 24 | def get_two_secs(filename): 25 | sig, rate = read_in_audio(filename) 26 | abs_sig = np.abs(sig) 27 | two_secs = abs_sig[rate:3*rate] 28 | return two_secs 29 | 30 | # calculates moving average for a specified window (number of samples) 31 | def take_moving_average(sig, window_width): 32 | cumsum_vec = np.cumsum(np.insert(sig, 0, 0)) 33 | ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width])/float(window_width) 34 | return ma_vec 35 |
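# (Worked example, not part of the original file: for sig = [1, 2, 3, 4] and
# window_width = 2, cumsum of [0, 1, 2, 3, 4] -> [0, 1, 3, 6, 10], and the
# differences give [3, 5, 7] / 2 = [1.5, 2.5, 3.5] -- the same result as
# np.convolve(sig, [0.5, 0.5], 'valid').)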
36 | # read in signal, change sample rate to outrate (samples/sec), use write_wav=True to save wav file to disk 37 | def downsample(filename, outrate=8000, write_wav = False): 38 | print(filename) 39 | (rate, sig) = wav.read(filename) 40 | down_sig = librosa.core.resample(sig * 1., rate, outrate, scale=True) 41 | if not write_wav: 42 | return down_sig, outrate 43 | if write_wav: 44 | wav_write('{}_down_{}.wav'.format(filename, outrate), outrate, down_sig) 45 | 46 | def librosa_downsample(filename, outrate=8000): 47 | y, s = librosa.load(filename, sr=outrate) 48 | return y, s 49 | 50 | # trim or zero-pad the downsampled file to a fixed 40000 samples (5 s at 8 kHz) and standardize it 51 | def make_standard_length(filename): 52 | down_sig, rate = librosa_downsample(filename) 53 | normed_sig = librosa.util.fix_length(down_sig, 40000) 54 | # start = 1 * 8000 55 | # end = 4 * 8000 56 | # normed_sig = down_sig[start:end] 57 | normed_sig = (normed_sig - np.mean(normed_sig))/np.std(normed_sig) 58 | return normed_sig 59 | 60 | # from a folder containing wav files, normalize each, divide into num_splits-1 chunks and write the resulting np.arrays to a single matrix 61 | def make_split_audio_array(folder, num_splits = 5): 62 | """ 63 | returns numpy array of split audio for a folder 64 | """ 65 | lst = [] 66 | for filename in tqdm(os.listdir(folder)) : 67 | if filename.endswith('wav'): 68 | normed_sig = make_standard_length(os.path.join(folder, filename)) 69 | chunk = normed_sig.shape[0]//num_splits # integer chunk size so the slices below stay valid indices 70 | for i in range(num_splits - 1): 71 | lst.append(normed_sig[i*chunk:(i+2)*chunk]) 72 | lst = np.array(lst) 73 | lst = lst.reshape(lst.shape[0], -1) 74 | return lst 75 | 76 | # for input wav file outputs a (13, n_frames) mfcc np array 77 | def make_normed_mfcc(filename, outrate=8000): 78 | normed_sig = make_standard_length(filename) 79 | normed_mfcc_feat = mfcc(normed_sig, outrate) 80 | normed_mfcc_feat = normed_mfcc_feat.T 81 | return normed_mfcc_feat 82 | 83 | # make mfcc np array from wav file using librosa package 84 | def make_librosa_mfcc(filename): 85 | y, sr = librosa.load(filename) 86 | mfcc_feat = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) 87 | return mfcc_feat 88 | 89 | # make mfcc np array from wav file using speech features package 90 | def make_mfcc(filename): 91 | (rate, sig) = wav.read(filename) 92 | mfcc_feat = mfcc(sig, rate) 93 | mfcc_feat = mfcc_feat.T 94 | return mfcc_feat 95 | 96 | # for folder containing wav files, output numpy array of normed mfcc 97 | def make_class_array(folder): 98 | lst = [] 99 | files = os.listdir(folder) 100 | count_files = len(files) 101 | for file_path in tqdm(files): 102 | filename = os.path.join(folder, file_path) 103 | lst.append(make_normed_mfcc(filename)) 104 | class_array = np.array(lst) 105 | class_array = np.reshape(class_array, (class_array.shape[0], class_array.shape[2], class_array.shape[1])) 106 | return class_array 107 | 108 | # read in wav file, output (1,13) numpy array of mean mfccs for each of 13 features 109 | def make_mean_mfcc(filename): 110 | try: 111 | (rate, sig) = wav.read(filename) 112 | mfcc_feat = mfcc(sig, rate) 113 | avg_mfcc = np.mean(mfcc_feat, axis = 0) 114 | return avg_mfcc 115 | except: # unreadable wav: skip and return None 116 | pass 117 | 118 | # write new csv corresponding to dataframe of given language and gender 119 | def make_df_language_gender(df, language, gender): 120 | newdf = df.query("native_language == @language").query("sex == @gender") 121 | newdf.to_csv('df_{}_{}.csv'.format(language, gender)) 122 | 123 | # write new directories to disk containing the male and female speakers from the most common languages 124 | def make_folders_from_csv(): 125 |
top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese'] 126 | for lang in top_15_langs: 127 | os.makedirs('{}/{}_male'.format(lang, lang)) 128 | os.makedirs('{}/{}_female'.format(lang, lang)) 129 | 130 | # copy files to the corresponding directories 131 | def copy_files_from_csv(): 132 | top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese'] 133 | for lang in top_15_langs: 134 | df_male = pd.read_csv('df_{}_male.csv'.format(lang)) 135 | df_female = pd.read_csv('df_{}_female.csv'.format(lang)) 136 | m_list = df_male['filename'].values 137 | f_list = df_female['filename'].values 138 | for filename in f_list: 139 | shutil.copy2('big_langs/{}/{}.wav'.format(lang, filename), 'big_langs/{}/{}_female/{}.wav'.format(lang, lang, filename)) 140 | 141 | # input folder of wav files, output pandas dataframe of mean mfcc values 142 | def make_mean_mfcc_df(folder): 143 | norms = [] 144 | for file_path in os.listdir(folder): 145 | filename = os.path.join(folder, file_path) 146 | (rate, sig) = wav.read(filename) 147 | mfcc_feat = mfcc(sig, rate) 148 | mean_mfcc = np.mean(mfcc_feat, axis = 0) 149 | #mean_mfcc = np.reshape(mean_mfcc, (1,13)) 150 | norms.append(mean_mfcc) 151 | flat = [a.ravel() for a in norms] 152 | stacked = np.vstack(flat) 153 | df = pd.DataFrame(stacked) 154 | return df 155 | -------------------------------------------------------------------------------- /data/all_accents/split_to_wav.sh: -------------------------------------------------------------------------------- 1 | for file in *.wav 2 | do 3 | python3 alt_split.py "$file" --output-dir all_accents_out/ --min-silence-length=2 --silence-threshold=0.01 4 | done 5 | -------------------------------------------------------------------------------- /data/folder_structure: -------------------------------------------------------------------------------- 1 | data [error opening dir] 2 | 3 | 0 directories, 0 files 4 | -------------------------------------------------------------------------------- /data/non_trained_accents/accent_trim_gen_x.py: -------------------------------------------------------------------------------- 1 | from mfcc import * 2 | import numpy as np 3 | 4 | folder = '/home/enigmaeth/accentPhase2/data/non_trained_accents/bangla' 5 | 6 | x = make_class_array(folder) 7 | print(x.shape) 8 | X_file = '/home/enigmaeth/accentPhase2/data/non_trained_accents/numpy_vectors/x_' + (folder.split('/'))[-1] 9 | 10 | print("saving labels to ", X_file) 11 | np.save(X_file, x) 12 | 13 | 14 | 15 | # filename = "english1.wav" 16 | 17 | # with open(filename, 'rb') as f: 18 | # print(read_in_audio(f)) 19 | 20 | # cd = make_class_array('/media/enigmaeth/My Passport/Datasets/Accent/clean_data') 21 | # print(cd.shape) 22 | # np.save('top_3_100_split_mfcc.npy', cd) 23 | # mf = make_mean_mfcc_df('/media/enigmaeth/My Passport/Datasets/Accent/sounds_wav') 24 | # print(mf.shape) 25 | # np.save('top_3_100_split_y.npy', mf) 26 | -------------------------------------------------------------------------------- /data/non_trained_accents/accent_trim_gen_y.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | def generate_y(folder): 5 | accents = {} 6 | counts = {} 7 | y = [] 8 | index = 0 9 | 10 | for filename in os.listdir(folder): 11 
| name = ''.join([i for i in filename if not i.isdigit()]) 12 | name = name.split('_')[0] 13 | if name not in accents: 14 | accents[name] = index 15 | index += 1 16 | counts[name] = 0 17 | 18 | counts[name] += 1 19 | y.append(accents[name]+1) 20 | 21 | print(counts) 22 | print(accents) 23 | 24 | sorted_counts = sorted(counts, key=counts.get, reverse=True) 25 | for r in sorted_counts: 26 | print(r, counts[r]) 27 | 28 | np_y = np.reshape(np.array(y), (len(y), 1)) 29 | 30 | Y_file = '/home/enigmaeth/accentPhase2/data/non_trained_accents/numpy_vectors/y_'+ (folder.split('/'))[-1] 31 | print("saving labels to ", Y_file) 32 | np.save(Y_file, y) 33 | 34 | folder = "/home/enigmaeth/accentPhase2/data/non_trained_accents/bangla" 35 | generate_y(folder)
-------------------------------------------------------------------------------- /data/non_trained_accents/mfcc.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import pandas as pd 3 | from python_speech_features import mfcc 4 | from python_speech_features import logfbank 5 | import scipy.io.wavfile as wav 6 | from scipy.io.wavfile import write as wav_write 7 | import librosa 8 | import scipy 9 | from tqdm import tqdm 10 | import shutil # used by copy_files_from_csv below 11 | import os 12 | 13 | 14 | ''' 15 | mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True) 16 | ''' 17 | # read in wav file, get out signal (np array) and sampling rate (int) 18 | def read_in_audio(filename): 19 | (rate, sig) = wav.read(filename) 20 | return sig, rate 21 | 22 | 23 | # read in signal, take absolute value and slice seconds 1-3 from beginning 24 | def get_two_secs(filename): 25 | sig, rate = read_in_audio(filename) 26 | abs_sig = np.abs(sig) 27 | two_secs = abs_sig[rate:3*rate] 28 | return two_secs 29 | 30 | # calculates moving average for a specified window (number of samples) 31 | def take_moving_average(sig, window_width): 32 | cumsum_vec = np.cumsum(np.insert(sig, 0, 0)) 33 | ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width])/float(window_width) 34 | return ma_vec 35 | 36 | # read in signal, change sample rate to outrate (samples/sec), use write_wav=True to save wav file to disk 37 | def downsample(filename, outrate=8000, write_wav = False): 38 | print(filename) 39 | (rate, sig) = wav.read(filename) 40 | down_sig = librosa.core.resample(sig * 1., rate, outrate, scale=True) 41 | if not write_wav: 42 | return down_sig, outrate 43 | if write_wav: 44 | wav_write('{}_down_{}.wav'.format(filename, outrate), outrate, down_sig) 45 | 46 | def librosa_downsample(filename, outrate=8000): 47 | y, s = librosa.load(filename, sr=outrate) 48 | return y, s 49 | 50 | # trim or zero-pad the downsampled file to a fixed 40000 samples (5 s at 8 kHz) and standardize it 51 | def make_standard_length(filename): 52 | down_sig, rate = librosa_downsample(filename) 53 | normed_sig = librosa.util.fix_length(down_sig, 40000) 54 | # start = 1 * 8000 55 | # end = 4 * 8000 56 | # normed_sig = down_sig[start:end] 57 | normed_sig = (normed_sig - np.mean(normed_sig))/np.std(normed_sig) 58 | return normed_sig 59 | 60 | # from a folder containing wav files, normalize each, divide into num_splits-1 chunks and write the resulting np.arrays to a single matrix 61 | def make_split_audio_array(folder, num_splits = 5): 62 | """ 63 | returns numpy array of split audio for a folder 64 | """ 65 | lst = []
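    # (Worked example, not part of the original file: make_standard_length above
    # fixes every signal to 40000 samples, so with num_splits=5, chunk = 8000 and
    # the loop below yields num_splits-1 = 4 overlapping windows [0:16000],
    # [8000:24000], [16000:32000], [24000:40000] -- 2 s each at 8 kHz, 50% overlap.)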
66 | for filename in tqdm(os.listdir(folder)) : 67 | if filename.endswith('wav'): 68 | normed_sig = make_standard_length(os.path.join(folder, filename)) 69 | chunk = normed_sig.shape[0]//num_splits # integer chunk size so the slices below stay valid indices 70 | for i in range(num_splits - 1): 71 | lst.append(normed_sig[i*chunk:(i+2)*chunk]) 72 | lst = np.array(lst) 73 | lst = lst.reshape(lst.shape[0], -1) 74 | return lst 75 | 76 | # for input wav file outputs a (13, n_frames) mfcc np array 77 | def make_normed_mfcc(filename, outrate=8000): 78 | normed_sig = make_standard_length(filename) 79 | normed_mfcc_feat = mfcc(normed_sig, outrate) 80 | normed_mfcc_feat = normed_mfcc_feat.T 81 | return normed_mfcc_feat 82 | 83 | # make mfcc np array from wav file using librosa package 84 | def make_librosa_mfcc(filename): 85 | y, sr = librosa.load(filename) 86 | mfcc_feat = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) 87 | return mfcc_feat 88 | 89 | # make mfcc np array from wav file using speech features package 90 | def make_mfcc(filename): 91 | (rate, sig) = wav.read(filename) 92 | mfcc_feat = mfcc(sig, rate) 93 | mfcc_feat = mfcc_feat.T 94 | return mfcc_feat 95 | 96 | # for folder containing wav files, output numpy array of normed mfcc 97 | def make_class_array(folder): 98 | lst = [] 99 | files = os.listdir(folder) 100 | count_files = len(files) 101 | for file_path in tqdm(files): 102 | filename = os.path.join(folder, file_path) 103 | lst.append(make_normed_mfcc(filename)) 104 | class_array = np.array(lst) 105 | class_array = np.reshape(class_array, (class_array.shape[0], class_array.shape[2], class_array.shape[1])) 106 | return class_array 107 | 108 | # read in wav file, output (1,13) numpy array of mean mfccs for each of 13 features 109 | def make_mean_mfcc(filename): 110 | try: 111 | (rate, sig) = wav.read(filename) 112 | mfcc_feat = mfcc(sig, rate) 113 | avg_mfcc = np.mean(mfcc_feat, axis = 0) 114 | return avg_mfcc 115 | except: # unreadable wav: skip and return None 116 | pass 117 | 118 | # write new csv corresponding to dataframe of given language and gender 119 | def make_df_language_gender(df, language, gender): 120 | newdf = df.query("native_language == @language").query("sex == @gender") 121 | newdf.to_csv('df_{}_{}.csv'.format(language, gender)) 122 | 123 | # write new directories to disk containing the male and female speakers from the most common languages 124 | def make_folders_from_csv(): 125 | top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese'] 126 | for lang in top_15_langs: 127 | os.makedirs('{}/{}_male'.format(lang, lang)) 128 | os.makedirs('{}/{}_female'.format(lang, lang)) 129 | 130 | # copy files to the corresponding directories 131 | def copy_files_from_csv(): 132 | top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese'] 133 | for lang in top_15_langs: 134 | df_male = pd.read_csv('df_{}_male.csv'.format(lang)) 135 | df_female = pd.read_csv('df_{}_female.csv'.format(lang)) 136 | m_list = df_male['filename'].values 137 | f_list = df_female['filename'].values 138 | for filename in f_list: 139 | shutil.copy2('big_langs/{}/{}.wav'.format(lang, filename), 'big_langs/{}/{}_female/{}.wav'.format(lang, lang, filename)) 140 | 141 | # input folder of wav files, output pandas dataframe of mean mfcc values 142 | def make_mean_mfcc_df(folder): 143 | norms = [] 144 | for file_path in os.listdir(folder): 145 | filename = os.path.join(folder, file_path) 146 | (rate, sig) =
wav.read(filename) 147 | mfcc_feat = mfcc(sig, rate) 148 | mean_mfcc = np.mean(mfcc_feat, axis = 0) 149 | #mean_mfcc = np.reshape(mean_mfcc, (1,13)) 150 | norms.append(mean_mfcc) 151 | flat = [a.ravel() for a in norms] 152 | stacked = np.vstack(flat) 153 | df = pd.DataFrame(stacked) 154 | return df 155 | -------------------------------------------------------------------------------- /data/numpy_vectors/conv1d.ipynb - Colaboratory3:1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/data/numpy_vectors/conv1d.ipynb - Colaboratory3:1.pdf -------------------------------------------------------------------------------- /data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep10.pdf -------------------------------------------------------------------------------- /data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep10new.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep10new.pdf -------------------------------------------------------------------------------- /data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/data/numpy_vectors/conv1d.ipynb - Colaboratoryacc99ep12.pdf -------------------------------------------------------------------------------- /data/numpy_vectors/conv1d.ipynb acc1 splithalf - Colaboratory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/data/numpy_vectors/conv1d.ipynb acc1 splithalf - Colaboratory.pdf -------------------------------------------------------------------------------- /harvard_sentences.txt: -------------------------------------------------------------------------------- 1 | "The birch canoe slid on the smooth planks.", 2 | "Glue the sheet to the dark blue background", 3 | "It's easy to tell the depth of a well.", 4 | "These days a chicken leg is a rare dish.", 5 | "Rice is often served in round bowls", 6 | "The juice of lemons makes fine punch.", 7 | "The box was thrown beside the parked truck.", 8 | "The hogs were fed chopped corn and garbage.", 9 | "Four hours of steady work faced us.", 10 | "A large size in stockings is hard to sell.", 11 | "The boy was there when the sun rose", 12 | "A rod is used to catch pink salmon.", 13 | "The source of the huge river is the clear spring.", 14 | "Kick the ball straight and follow through", 15 | "Help the woman get back to her feet.", 16 | "A pot of tea helps to pass the evening.", 17 | "Smoky fires lack flame and heat.", 18 | "The soft cushion broke the man's fall.", 19 | "The salt breeze came across from the sea.", 20 | "The girl at the booth sold fifty bonds.", 21 | "The small pup gnawed a hole in the sock.", 22 | "The fish twisted and turned on the bent hook.", 23 | "Press the pants and sew a button on the vest.", 24 | "The swan dive was far short of perfect.", 25 | "The beauty of the view 
stunned the young boy.", 26 | "Two blue fish swam in the tank.", 27 | "Her purse was full of useless trash.", 28 | "The colt reared and threw the tall rider.", 29 | "It snowed, rained, and hailed the same morning.", 30 | "Read verse out loud for pleasure", 31 | "Hoist the load to your left shoulder.", 32 | "Take the winding path to reach the lake.", 33 | "Note closely the size of the gas tank.", 34 | "Wipe the grease off his dirty face.", 35 | "Mend the coat before you go out.", 36 | "The wrist was badly strained and hung limp.", 37 | "The stray cat gave birth to kittens.", 38 | "The young girl gave no clear response.", 39 | "The meal was cooked before the bell rang", 40 | "What joy there is in living.", 41 | "A king ruled the state in the early days.", 42 | "The ship was torn apart on the sharp reef.", 43 | "Sickness kept him home the third week.", 44 | "The wide road shimmered in the hot sun.", 45 | "The lazy cow lay in the cool grass.", 46 | "Lift the square stone over the fence.", 47 | "The rope will bind the seven books at once.", 48 | "Hop over the fence and plunge in.", 49 | "The friendly gang left the drug store.", 50 | "Mesh wire keeps chicks inside.", 51 | "The frosty air passed through the coat.", 52 | "The crooked maze failed to fool the mouse.", 53 | "Adding fast leads to wrong sums.", 54 | "The show was a flop from the very start.", 55 | "A saw is a tool used for making boards", 56 | "The wagon moved on well oiled wheels", 57 | "March the soldiers past the next hill.", 58 | "A cup of sugar makes sweet fudge.", 59 | "Place a rosebush near the porch steps.", 60 | "Both lost their lives in the raging storm.", 61 | "We talked of the side show in the circus", 62 | "Use a pencil to write the first draft", 63 | "He ran half way to the hardware store.", 64 | "The clock struck to mark the third period", 65 | "A small creek cut across the field", 66 | "Cars and busses stalled in snow drifts", 67 | "The set of china hit the floor with a crash.", 68 | "This is a grand season for hikes on the road", 69 | "The dune rose from the edge of the water.", 70 | "Those words were the cue for the actor to leave", 71 | "A yacht slid around the point into the bay", 72 | "The two met while playing on the sand", 73 | "The ink stain dried on the finished page", 74 | "The walled town was seized without a fight.", 75 | "The lease ran out in sixteen weeks.", 76 | "A tame squirrel makes a nice pet.", 77 | "The horn of the car woke the sleeping cop.", 78 | "The heart beat strongly and with firm strokes.", 79 | "The pearl was worn in a thin silver ring.", 80 | "The fruit peel was cut in thick slices", 81 | "The Navy attacked the big task force", 82 | "See the cat glaring at the scared mouse.", 83 | "There are more than two factors here", 84 | "The hat brim was wide and too droopy", 85 | "The lawyer tried to lose his case", 86 | "The grass curled around the fence post", 87 | "Cut the pie into large parts", 88 | "Men strive but seldom get rich.", 89 | "Always close the barn door tight", 90 | "He lay prone and hardly moved a limb", 91 | "The slush lay deep along the street.", 92 | "A wisp of cloud hung in the blue air.", 93 | "A pound of sugar costs more than eggs", 94 | "The fin was sharp and cut the clear water", 95 | "The play seems dull and quite stupid", 96 | "Bail the boat to stop it from sinking", 97 | "The term ended in late june that year", 98 | "A Tusk is used to make costly gifts", 99 | "Ten pins were set in order", 100 | "The bill was paid every third week", 101 | "Oak is strong and also gives 
shade", 102 | "Cats and Dogs each hate the other", 103 | "The pipe began to rust while new", 104 | "Open the crate but don't break the glass", 105 | "Add the sum to the product of these three", 106 | "Thieves who rob friends deserve jail", 107 | "The ripe taste of cheese improves with age", 108 | "Act on these orders with great speed", 109 | "The hog crawled under the high fence", 110 | "Move the vat over the hot fire", 111 | "The bark of the pine tree was shiny and dark", 112 | "Leaves turn brown and yellow in the fall.", 113 | "The pennant waved when the wind blew", 114 | "Split the log with a quick, sharp blow", 115 | "Burn peat after the logs give out", 116 | "He ordered peach pie with ice cream", 117 | "Weave the carpet on the right hand side", 118 | "Hemp is a weed found in parts of the tropics", 119 | "A lame back kept his score low", 120 | "We find joy in the simplest things", 121 | "Type out three lists of orders", 122 | "The harder he tried the less he got done", 123 | "The boss ran the show with a watchful eye", 124 | "The cup cracked and spilled its contents", 125 | "Paste can cleanse the most dirty brass", 126 | "The slang word for raw whiskey is booze", 127 | "It caught its hind paw in a rusty trap", 128 | "The wharf could be seen at the farther shore", 129 | "Feel the heat of the weak dying flame", 130 | "The tiny girl took off her hat", 131 | "A cramp is no small danger on a swim", 132 | "He said the same phrase thirty times", 133 | "Pluck the bright rose without leaves", 134 | "Two plus seven is less than ten", 135 | "The glow deepened in the eyes of the sweet girl", 136 | "Bring your problems to the wise chief", 137 | "Write a fond note to the friend you cherish", 138 | "Clothes and lodging are free to new men", 139 | "We frown when events take a bad turn", 140 | "Port is a strong wine with a smoky taste", 141 | "The young kid jumped the rusty gate", 142 | "Guess the result from the first scores.", 143 | "A salt pickle tastes fine with ham", 144 | "The just claim got the right verdict", 145 | "Those thistles bend in a high wind", 146 | "Pure bred poodles have curls", 147 | "The tree top waved in a graceful way.", 148 | "The spot on the blotter was made by green ink.", 149 | "Mud was spattered on the front of his white shirt", 150 | "The cigar burned a hole in the desk top.", 151 | "The empty flask stood on the tin tray", 152 | "A speedy man can beat this track mark.", 153 | "He broke a new shoelace that day", 154 | "The coffee stand is too high for the couch.", 155 | "The urge to write short stories is rare.", 156 | "The pencils have all been used", 157 | "The pirates seized the crew of the lost ship", 158 | "We tried to replace the coin but failed.", 159 | "She sewed the torn coat quite neatly.", 160 | "The sofa cushion is red and of light weight", 161 | "The jacket hung on the back of the wide chair", 162 | "At that high level the air is pure", 163 | "Drop the two when you add the figures", 164 | "A filing case is now hard to buy", 165 | "An abrupt start does not win the prize.", 166 | "Wood is best for making toys and blocks", 167 | "The office paint was a dull, sad tan", 168 | "He knew the skill of the great young actress", 169 | "A rag will soak up spilled water", 170 | "A shower of dirt fell from the hot pipes", 171 | "Steam hissed from the broken valve.", 172 | "The child almost hurt the small dog", 173 | "There was a sound of dry leaves outside", 174 | "The sky that morning was clear and bright blue", 175 | "Torn scraps littered the stone floor", 176 | "Sunday is 
the best part of the week.", 177 | "The doctor cured him with these pills", 178 | "The new girl was fired today at noon", 179 | "They felt gay when the ship arrived in port", 180 | "Add the store's account to the last cent", 181 | "Acid burns holes in wool cloth", 182 | "Fairy tales should be fun to write", 183 | "Eight miles of woodland burned to waste", 184 | "The third act was dull and tired the players", 185 | "A young child should not suffer fright", 186 | "Add the column and put the sum here", 187 | "We admire and love a good cook", 188 | "There the flood mark is ten inches", 189 | "He carved a head from the round block of marble", 190 | "She has a smart way of wearing clothes", 191 | "The fruit of a fig tree is apple shaped", 192 | "Corn cobs can be used to kindle a fire.", 193 | "Where were they when the noise started", 194 | "The paper box is full of thumb tacks", 195 | "Sell your gift to a buyer at a good gain", 196 | "The tongs lay beside the ice pail", 197 | "The petals fall with the next puff of wind", 198 | "Bring your best compass to the third class", 199 | "They could laugh although they were sad", 200 | "Farmers came in to thresh the oat crop", 201 | "The brown house was on fire to the attic", 202 | "The lure is used to catch trout and flounder", 203 | "Float the soap on top of the bath water", 204 | "A blue crane is a tall wading bird", 205 | "A fresh start will work such wonders", 206 | "The club rented the rink for the fifth night", 207 | "After the dance, they went straight home", 208 | "The hostess taught the new maid to serve", 209 | "He wrote his last novel there at the inn", 210 | "Even the worst will beat his low score", 211 | "The cement had dried when he moved it", 212 | "The loss of the second ship was hard to take", 213 | "The fly made its way along the wall", 214 | "Do that with a wooden stick", 215 | "Live wires should be kept covered", 216 | "The large house had hot water taps", 217 | "It is hard to erase blue or red ink", 218 | "Write at once or you may forget it", 219 | "The doorknob was made of bright clean brass", 220 | "The wreck occurred by the bank on Main Street", 221 | "A pencil with black lead writes best", 222 | "Coax a young calf to drink from a bucket", 223 | "Schools for ladies teach charm and grace", 224 | "The lamp shone with a steady green flame", 225 | "They took the axe and the saw to the forest", 226 | "The ancient coin was quite dull and worn", 227 | "The shaky barn fell with a loud crash.", 228 | "Jazz and swing fans like fast music", 229 | "Rake the rubbish up and then burn it", 230 | "Slash the gold cloth into fine ribbons", 231 | "Try to have the court decide the case", 232 | "They are pushed back each time they attack", 233 | "He broke his ties with groups of former friends", 234 | "They floated on the raft to sun their white backs", 235 | "The map had an X that meant nothing", 236 | "Whitings are small fish caught in nets", 237 | "Some ads serve to cheat buyers", 238 | "Jerk the rope and the bell rings weakly", 239 | "A waxed floor makes us lose balance", 240 | "Madam, this is the best brand of corn", 241 | "On the islands the sea breeze is soft and mild", 242 | "The play began as soon as we sat down", 243 | "This will lead the world to more sound and fury", 244 | "Add salt before you fry the egg", 245 | "The rush for funds reached its peak Tuesday", 246 | "The birch looked stark white and lonesome", 247 | "The box is held by a bright red snapper", 248 | "To make pure ice, you freeze water", 249 | "The first worm gets snapped 
early", 250 | "Jump the fence and hurry up the bank", 251 | "Yell and clap as the curtain slides back", 252 | "They are men who walk the middle of the road", 253 | "Both brothers wear the same size", 254 | "In some form or other we need fun", 255 | "The prince ordered his head chopped off", 256 | "The houses are built of red clay bricks", 257 | "Ducks fly north but lack a compass", 258 | "Fruit flavors are used in fizz drinks", 259 | "These pills do less good than others", 260 | "Canned pears lack full flavor", 261 | "The dark pot hung in the front closet", 262 | "Carry the pail to the wall and spill it there", 263 | "The train brought our hero to the big town", 264 | "We are sure that one war is enough", 265 | "Gray paint stretched for miles around", 266 | "The rude laugh filled the empty room", 267 | "High seats are best for football fans", 268 | "Tea served from the brown jug is tasty", 269 | "A dash of pepper spoils beef stew", 270 | "A zestful food is the hot-cross bun", 271 | "The horse trotted around the field at a brisk pace", 272 | "Find the twin who stole the pearl necklace", 273 | "Cut the cord that binds the box tightly", 274 | "The red tape bound the smuggled food", 275 | "Look in the corner to find the tan shirt", 276 | "The cold drizzle will halt the bond drive", 277 | "Nine men were hired to dig the ruins", 278 | "The junk yard had a mouldy smell", 279 | "The flint sputtered and lit a pine torch", 280 | "Soak the cloth and drown the sharp odor", 281 | "The shelves were bare of both jam or crackers", 282 | "A joy to every child is the swan boat", 283 | "All sat frozen and watched the screen", 284 | "A cloud of dust stung his tender eyes", 285 | "To reach the end he needs much courage", 286 | "Shape the clay gently into block form", 287 | "A ridge on a smooth surface is a bump or flaw", 288 | "Hedge apples may stain your hands green", 289 | "Quench your thirst, then eat the crackers", 290 | "Tight curls get limp on rainy days", 291 | "The mute muffled the high tones of the horn", 292 | "The gold ring fits only a pierced ear", 293 | "The old pan was covered with hard fudge", 294 | "Watch the log float in the wide river", 295 | "The node on the stalk of wheat grew daily", 296 | "The heap of fallen leaves was set on fire", 297 | "Write fast if you want to finish early", 298 | "His shirt was clean but one button was gone", 299 | "The barrel of beer was a brew of malt and hops", 300 | "Tin cans are absent from store shelves", 301 | "Slide the box into that empty space", 302 | "The plant grew large and green in the window", 303 | "The beam dropped down on the workman's head", 304 | "Pink clouds floated with the breeze", 305 | "She danced like a swan, tall and graceful", 306 | "The tube was blown and the tire flat and useless", 307 | "It is late morning on the old wall clock", 308 | "Let's all join as we sing the last chorus", 309 | "The last switch cannot be turned off", 310 | "The fight will end in just six minutes", 311 | "The store walls were lined with colored frocks", 312 | "The peace league met to discuss their plans.", 313 | "The rise to fame of a person takes luck", 314 | "Paper is scarce, so write with much care", 315 | "The quick fox jumped on the sleeping cat", 316 | "The nozzle of the fire hose was bright brass", 317 | "Screw the round cap on as tight as needed", 318 | "Time brings us many changes", 319 | "The purple tie was ten years old", 320 | "Men think and plan and sometimes act", 321 | "Fill the ink jar with sticky glue", 322 | "He smoke a big pipe with strong 
contents", 323 | "We need grain to keep our mules healthy", 324 | "Pack the records in a neat thin case", 325 | "The crunch of feet in the snow was the only sound", 326 | "The copper bowl shone in the sun's rays", 327 | "Boards will warp unless kept dry.", 328 | "The plush chair leaned against the wall.", 329 | "Glass will clink when struck by metal", 330 | "Bathe and relax in the cool green grass", 331 | "Nine rows of soldiers stood in a line", 332 | "The beach is dry and shallow at low tide", 333 | "The idea is to sew both edges straight", 334 | "The kitten chased the dog down the street", 335 | "Pages bound in cloth make a book", 336 | "Try to trace the fine lines of the painting", 337 | "Women form less than half of the group.", 338 | "The zones merge in the central part of town", 339 | "A gem in the rough needs work to polish", 340 | "Code is used when secrets are sent", 341 | "Most of the news is easy for us to hear", 342 | "He used the lathe to make brass objects", 343 | "The vane on top of the pole revolved in the wind", 344 | "Mince pie is a dish served to children", 345 | "The clan gathered on each dull night", 346 | "Let it burn, it gives us warmth and comfort", 347 | "A castle built from sand fails to endure", 348 | "A child's wit saved the day for us", 349 | "Tack the strip of carpet to the worn floor", 350 | "Next Tuesday we must vote", 351 | "Pour the stew from the pot into the plate", 352 | "Each penny shone like new", 353 | "The man went to the woods to gather sticks", 354 | "The dirt piles were lines along the road", 355 | "The logs fell and tumbled into the clear stream", 356 | "Just hoist it up and take it away", 357 | "A ripe plum is fit for a king's palate", 358 | "Our plans right now are hazy.", 359 | "Brass rings are sold by these natives", 360 | "It takes a good trap to capture a bear", 361 | "Feed the white mouse some flower seeds", 362 | "The thaw came early and freed the stream", 363 | "He took the lead and kept it the whole distance", 364 | "The key you designed will fit the lock", 365 | "Plead to the council to free the poor thief", 366 | "Better hash is made of rare beef", 367 | "This plank was made for walking on ", 368 | "The lake sparkled in the red hot sun", 369 | "He crawled with care along the ledge", 370 | "Tend the sheep while the dog wanders", 371 | "It takes a lot of help to finish these", 372 | "Mark the spot with a sign painted red", 373 | "Take two shares as a fair profit.", 374 | "The fur of cats goes by many names", 375 | "North winds bring colds and fevers", 376 | "He asks no person to vouch for him", 377 | "Go now and come here later", 378 | "A sash of gold silk will trim her dress", 379 | "Soap can wash most dirt away", 380 | "That move means the game is over", 381 | "He wrote down a long list of items", 382 | "A siege will crack the strong defense", 383 | "Grape juice and water mix well", 384 | "Roads are paved with sticky tar", 385 | "Fake stones shine but cost little", 386 | "The drip of the rain made a pleasant sound.", 387 | "Smoke poured out of every crack.", 388 | "Serve the hot rum to the tired heroes", 389 | "Much of the story makes good sense.", 390 | "The sun came up to light the eastern sky", 391 | "Heave the line over the port side", 392 | "A lathe cuts and trims any wood", 393 | "It's a dense crowd in two distinct ways", 394 | "His hip struck the knee of the next player", 395 | "The stale smell of old beer lingers", 396 | "The desk was firm on the shaky floor", 397 | "It takes heat to bring out the odor", 398 | "Beef is scarcer 
than some lamb", 399 | "Raise the sail and steer the ship northward", 400 | "A cone costs five cents on Mondays", 401 | "A pod is what peas always grow in", 402 | "Jerk that dart from the cork target", 403 | "No cement will hold hard wood", 404 | "We now have a new base for shipping", 405 | "A list of names is carved around the base", 406 | "The sheep were led home by a dog", 407 | "Three for a dime, the young peddler cried", 408 | "The sense of smell is better than that of touch", 409 | "No hardship seemed to make him sad", 410 | "Grace makes up for lack of beauty", 411 | "Nudge gently but wake her now", 412 | "The news struck doubt into restless minds", 413 | "Once we stood beside the shore", 414 | "A chink in the wall allowed a draft to blow", 415 | "Fasten two pins on each side", 416 | "A cold dip restores health and zest", 417 | "He takes the oath of office each March", 418 | "The sand drifts over the sills of the old house", 419 | "The point of the steel pen was bent and twisted", 420 | "There is a lag between thought and act", 421 | "Seed is needed to plant the spring corn", 422 | "Draw the chart with heavy black lines", 423 | "The boy owed his pal thirty cents", 424 | "The chap slipped into the crowd and was lost", 425 | "Hats are worn to tea and not to dinner", 426 | "The ramp led up to the wide highway", 427 | "Beat the dust from the rug onto the lawn", 428 | "Say it slowly but make it ring clear", 429 | "The straw nest housed five robins", 430 | "Screen the porch with woven straw mats", 431 | "This horse will nose his way to the finish", 432 | "The dry wax protects the deep scratch", 433 | "He picked up the dice for a second roll", 434 | "These coins will be needed to pay his debt", 435 | "The nag pulled the frail cart along", 436 | "Twist the valve and release hot steam", 437 | "The vamp of the shoe had a gold buckle", 438 | "The smell of burned rags itches my nose", 439 | "New pants lack cuffs and pockets", 440 | "The marsh will freeze when cold enough", 441 | "They slice the sausage thin with a knife", 442 | "The bloom of the rose lasts a few days", 443 | "A gray mare walked before the colt", 444 | "Breakfast buns are fine with a hot drink", 445 | "Bottles hold four kinds of rum", 446 | "The man wore a feather in his felt hat", 447 | "He wheeled the bike past the winding road", 448 | "Drop the ashes on the worn old rug", 449 | "The desk and both chairs were painted tan", 450 | "Throw out the used paper cup and plate", 451 | "A clean neck means a neat collar", 452 | "The couch cover and hall drapes were blue", 453 | "The stems of the tall glasses cracked and broke", 454 | "The wall phone rang loud and often", 455 | "The clothes dried on a thin wooden rack", 456 | "Turn out the lantern which gives us light", 457 | "The cleat sank deeply into the soft turf", 458 | "The bills were mailed promptly on the tenth of the month", 459 | "To have is better than to wait and hope", 460 | "The price is fair for a good antique clock", 461 | "The music played on while they talked", 462 | "Dispense with a vest on a day like this", 463 | "The bunch of grapes was pressed into wine", 464 | "He sent the figs, but kept the ripe cherries", 465 | "The hinge on the door creaked with old age", 466 | "The screen before the fire kept in the sparks", 467 | "Fly by night and you waste little time", 468 | "Thick glasses helped him read the print", 469 | "Birth and death marks the limits of life", 470 | "The chair looked strong but had no bottom", 471 | "The kite flew wildly in the high wind", 472 | "A fur 
muff is stylish once more", 473 | "The tin box held priceless stones", 474 | "We need an end of all such matter", 475 | "The case was puzzling to the old and wise", 476 | "The bright lanterns were gay on the dark lawn", 477 | "We don't get much money but we have fun", 478 | "The youth drove with zest, but little skill", 479 | "Five years he lived with a shaggy dog", 480 | "A fence cuts through the corner lot", 481 | "The way to save money is not to spend much", 482 | "Shut the hatch before the waves push it in", 483 | "The odor of spring makes young hearts jump", 484 | "Crack the walnut with your sharp side teeth", 485 | "He offered proof in the form of a large chart", 486 | "Send the stuff in a thick paper bag", 487 | "A quart of milk is water for the most part", 488 | "They told wild tales to frighten him", 489 | "The three story house was built of stone", 490 | "In the rear of the ground floor was a large passage", 491 | "A man in a blue sweater sat at the desk", 492 | "Oats are a food eaten by horse and man", 493 | "Their eyelids droop for want of sleep", 494 | "A sip of tea revives his tired friend", 495 | "There are many ways to do these things", 496 | "Tuck the sheet under the edge of the mat", 497 | "A force equal to that would move the earth", 498 | "We like to see clear weather", 499 | "The work of the tailor is seen on each side", 500 | "Take a chance and win a china doll", 501 | "Shake the dust from your shoes, stranger", 502 | "She was kind to sick old people", 503 | "The square wooden crate was packed to be shipped", 504 | "The dusty bench stood by the stone wall", 505 | "We dress to suit the weather of most days", 506 | "Smile when you say nasty words", 507 | "A bowl of rice is free with chicken stew", 508 | "The water in this well is a source of good health", 509 | "Take shelter in this tent, but keep still", 510 | "That guy is the writer of a few banned books", 511 | "The little tales they tell are false", 512 | "The door was barred, locked, and bolted as well", 513 | "Ripe pears are fit for a queen's table", 514 | "A big wet stain was on the round carpet", 515 | "The kite dipped and swayed, but stayed aloft", 516 | "The pleasant hours fly by much too soon", 517 | "The room was crowded with a wild mob", 518 | "This strong arm shall shield your honor", 519 | "She blushed when he gave her a white orchid", 520 | "The beetle droned in the hot June sun", 521 | "Press the pedal with your left foot", 522 | "Neat plans fail without luck", 523 | "The black trunk fell from the landing", 524 | "The bank pressed for payment of the debt", 525 | "The theft of the pearl pin was kept secret", 526 | "Shake hands with this friendly child", 527 | "The vast space stretched into the far distance", 528 | "A rich farm is rare in this sandy waste", 529 | "His wide grin earned many friends", 530 | "Flax makes a fine brand of paper", 531 | "Hurdle the pit with the aid of a long pole", 532 | "A strong bid may scare your partner stiff", 533 | "Even a just cause needs power to win", 534 | "Peep under the tent and see the clowns", 535 | "The leaf drifts along with a slow spin", 536 | "Cheap clothes are flashy but don't last", 537 | "A thing of small note can cause despair", 538 | "Flood the mails with requests for this book", 539 | "A thick coat of black paint covered all", 540 | "The pencil was cut to be sharp at both ends", 541 | "Those last words were a strong statement", 542 | "He wrote his name boldly at the top of the sheet", 543 | "Dill pickles are sour but taste fine", 544 | "Down that road is 
the way to the grain farmer", 545 | "Either mud or dust are found at all times", 546 | "The best method is to fix it in place with clips", 547 | "If you mumble your speech will be lost", 548 | "At night the alarm roused him from a deep sleep", 549 | "Read just what the meter says", 550 | "Fill your pack with bright trinkets for the poor", 551 | "The small red neon lamp went out", 552 | "Clams are small, round, soft, and tasty", 553 | "The fan whirled its round blades softly", 554 | "The line where the edges join was clean", 555 | "Breathe deep and smell the piny air", 556 | "It matters not if he reads these words or those", 557 | "A brown leather bag hung from its strap", 558 | "A toad and a frog are hard to tell apart", 559 | "A white silk jacket goes with any shoes", 560 | "A break in the dam almost caused a flood", 561 | "Paint the sockets in the wall dull green", 562 | "The child crawled into the dense grass", 563 | "Bribes fail where honest men work", 564 | "Trample the spark, else the flames will spread", 565 | "The hilt of the sword was carved with fine designs", 566 | "A round hole was drilled through the thin board", 567 | "Footprints showed the path he took up the beach", 568 | "She was waiting at my front lawn", 569 | "A vent near the edge brought in fresh air", 570 | "Prod the old mule with a crooked stick", 571 | "It is a band of steel three inches wide", 572 | "The pipe ran almost the length of the ditch", 573 | "It was hidden from sight by a mass of leaves and shrubs", 574 | "The weight of the package was seen on the high scale", 575 | "Wake and rise, and step into the green outdoors", 576 | "The green light in the brown box flickered", 577 | "The brass tube circled the high wall", 578 | "The lobes of her ears were pierced to hold rings", 579 | "Hold the hammer near the end to drive the nail", 580 | "Next Sunday is the twelfth of the month", 581 | "Every word and phrase he speaks is true", 582 | "He put his last cartridge into the gun and fired", 583 | "They took their kids from the public school", 584 | "Drive the screw straight into the wood", 585 | "Keep the hatch tight and the watch constant", 586 | "Sever the twine with a quick snip of the knife", 587 | "Paper will dry out when wet", 588 | "Slide the catch back and open the desk", 589 | "Help the weak to preserve their strength", 590 | "A sullen smile gets few friends", 591 | "Stop whistling and watch the boys march", 592 | "Jerk the cord, and out tumbles the gold", 593 | "Slide the tray across the glass top", 594 | "The cloud moved in a stately way and was gone", 595 | "Light maple makes for a swell room", 596 | "Set the piece here and say nothing", 597 | "Dull stories make her laugh", 598 | "A stiff cord will do to fasten your shoe", 599 | "Get the trust fund to the bank early", 600 | "Choose between the high road and the low", 601 | "A plea for funds seems to come again", 602 | "He lent his coat to the tall gaunt stranger", 603 | "There is a strong chance it will happen once more", 604 | "The duke left the park in a silver coach", 605 | "Greet the new guests and leave quickly", 606 | "When the frost has come it is time for turkey", 607 | "Sweet words work better than fierce", 608 | "A thin stripe runs down the middle", 609 | "A six comes up more often than a ten", 610 | "Lush ferns grow on the lofty rocks", 611 | "The ram scared the school children off", 612 | "The team with the best timing looks good", 613 | "The farmer swapped his horse for a brown ox", 614 | "Sit on the perch and tell the others what to do", 615 | 
"A steep trail is painful for our feet", 616 | "The early phase of life moves fast", 617 | "Green moss grows on the northern side", 618 | "Tea in thin china has a sweet taste", 619 | "Pitch the straw through the door of the stable", 620 | "The latch on the back gate needed a nail", 621 | "The goose was brought straight from the old market", 622 | "The sink is the thing in which we pile dishes", 623 | "A whiff of it will cure the most stubborn cold", 624 | "The facts don't always show who is right", 625 | "She flaps her cape as she parades the street", 626 | "The loss of the cruiser was a blow to the fleet", 627 | "Loop the braid to the left and then over", 628 | "Plead with the lawyer to drop the lost cause", 629 | "Calves thrive on tender spring grass", 630 | "Post no bills on this office wall", 631 | "Tear a thin sheet from the yellow pad", 632 | "A cruise in warm waters in a sleek yacht is fun", 633 | "A streak of color ran down the left edge", 634 | "It was done before the boy could see it", 635 | "Crouch before you jump or miss the mark", 636 | "Pack the kits and don't forget the salt", 637 | "The square peg will settle in the round hole", 638 | "Fine soap saves tender skin", 639 | "Poached eggs and tea must suffice", 640 | "Bad nerves are jangled by a door slam", 641 | "Ship maps are different from those for planes", 642 | "Dimes showered down from all sides", 643 | "They sang the same tunes at each party", 644 | "The sky in the west is tinged with orange red", 645 | "The pods of peas ferment in bare fields", 646 | "The horse balked and threw the tall rider", 647 | "The hitch between the horse and cart broke", 648 | "Pile the coal high in the shed corner", 649 | "A gold vase is both rare and costly", 650 | "The knife was hung inside its bright sheath", 651 | "The rarest spice comes from the far East", 652 | "The roof should be tilted at a sharp slant", 653 | "A smatter of French is worse than none", 654 | "The mule trod the treadmill day and night", 655 | "The aim of the contest is to raise a great fund", 656 | "To send it now in large amounts is bad", 657 | "There is a fine hard tang in salty air", 658 | "Cod is the main business of the north shore", 659 | "The slab was hewn from heavy blocks of slate", 660 | "Dunk the stale biscuits into strong drink", 661 | "Hang tinsel from both branches", 662 | "Cap the jar with a tight brass cover", 663 | "The poor boy missed the boat again", 664 | "Be sure to set that lamp firmly in the hole", 665 | "Pick a card and slip it under the pack", 666 | "A round mat will cover the dull spot", 667 | "The first part of the plan needs changing", 668 | "A good book informs of what we ought to know", 669 | "The mail comes in three batches per day", 670 | "You cannot brew tea in a cold pot", 671 | "Dots of light betrayed the black cat", 672 | "Put the chart on the mantel and tack it down", 673 | "The night shift men rate extra pay", 674 | "The red paper brightened the dim stage", 675 | "See the player scoot to third base", 676 | "Slide the bill between the two leaves", 677 | "Many hands help get the job done", 678 | "We don't like to admit our small faults", 679 | "No doubt about the way the wind blows", 680 | "Dig deep in the earth for pirate's gold", 681 | "The steady drip is worse than a drenching rain", 682 | "A flat pack takes less luggage space", 683 | "Green ice frosted the punch bowl", 684 | "A stuffed chair slipped from the moving van", 685 | "The stitch will serve but needs to be shortened", 686 | "A thin book fits in the side pocket", 687 | "The 
gloss on top made it unfit to read", 688 | "The hail pattered on the burnt brown grass", 689 | "Seven seals were stamped on great sheets", 690 | "Our troops are set to strike heavy blows", 691 | "The store was jammed before the sale could start", 692 | "It was a bad error on the part of the new judge", 693 | "One step more and the board will collapse", 694 | "Take the match and strike it against your shoe", 695 | "The pot boiled but the contents failed to jell", 696 | "The baby puts his right foot in his mouth", 697 | "The bombs left most of the town in ruins", 698 | "Stop and stare at the hard working man", 699 | "The streets are narrow and full of sharp turns", 700 | "The pup jerked the leash as he saw a feline shape", 701 | "Open your book to the first page", 702 | "Fish evade the net and swim off", 703 | "Dip the pail once and let it settle", 704 | "Will you please answer that phone", 705 | "The big red apple fell to the ground", 706 | "The curtain rose and the show was on", 707 | "The young prince became heir to the throne", 708 | "He sent the boy on a short errand", 709 | "Leave now and you will arrive on time", 710 | "The corner store was robbed last night", 711 | "A gold ring will please most any girl", 712 | "The long journey home took a year", 713 | "She saw a cat in the neighbor's house", 714 | "A pink shell was found on the sandy beach", 715 | "Small children came to see him", 716 | "The grass and bushes were wet with dew", 717 | "The blind man counted his old coins", 718 | "A severe storm tore down the barn", 719 | "She called his name many times", 720 | "When you hear the bell, come quickly", 721 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AccentDB/code/1b1b0a6fba57e94a3e4549e31340e5a39851e2eb/helpers/__init__.py -------------------------------------------------------------------------------- /helpers/alt_split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from scipy.io import wavfile 4 | import os 5 | import numpy as np 6 | import argparse 7 | from tqdm import tqdm 8 | 9 | # Utility functions 10 | xrange = range 11 | def windows(signal, window_size, step_size): 12 | if type(window_size) is not int: 13 | raise AttributeError("Window size must be an integer.") 14 | if type(step_size) is not int: 15 | raise AttributeError("Step size must be an integer.") 16 | for i_start in xrange(0, len(signal), step_size): 17 | i_end = i_start + window_size 18 | if i_end >= len(signal): 19 | break 20 | yield signal[i_start:i_end] 21 | 22 | def energy(samples): 23 | return np.sum(np.power(samples, 2.)) / float(len(samples)) 24 | 25 | def rising_edges(binary_signal): 26 | previous_value = 0 27 | index = 0 28 | for x in binary_signal: 29 | if x and not previous_value: 30 | yield index 31 | previous_value = x 32 | index += 1 33 | 34 | # Process command line arguments 35 | 36 | parser = argparse.ArgumentParser(description='Split a WAV file at silence.') 37 | parser.add_argument('input_file', type=str, help='The WAV file to split.') 38 | parser.add_argument('--output-dir', '-o', type=str, default='.', help='The output folder. Defaults to the current folder.') 39 | parser.add_argument('--min-silence-length', '-m', type=float, default=3., help='The minimum length of silence at which a split may occur [seconds]. 
Defaults to 3 seconds.') 40 | parser.add_argument('--silence-threshold', '-t', type=float, default=1e-6, help='The energy level (between 0.0 and 1.0) below which the signal is regarded as silent. Defaults to 1e-6 == 0.0001%.') 41 | parser.add_argument('--step-duration', '-s', type=float, default=None, help='The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Defaults to (min-silence-length / 10.).') 42 | parser.add_argument('--dry-run', '-n', action='store_true', help='Don\'t actually write any output files.') 43 | 44 | args = parser.parse_args() 45 | 46 | input_filename = args.input_file 47 | window_duration = args.min_silence_length 48 | if args.step_duration is None: 49 | step_duration = window_duration / 10. 50 | else: 51 | step_duration = args.step_duration 52 | silence_threshold = args.silence_threshold 53 | output_dir = args.output_dir 54 | output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0] 55 | dry_run = args.dry_run 56 | 57 | # print("Splitting {} where energy is below {}% for longer than {}s.".format( 58 | # input_filename, 59 | # silence_threshold * 100., 60 | # window_duration 61 | # )) 62 | 63 | # Read and split the file 64 | 65 | sample_rate, samples = wavfile.read(filename=input_filename, mmap=True) 66 | 67 | max_amplitude = np.iinfo(samples.dtype).max 68 | max_energy = energy([max_amplitude]) 69 | 70 | window_size = int(window_duration * sample_rate) 71 | step_size = int(step_duration * sample_rate) 72 | 73 | signal_windows = windows( 74 | signal=samples, 75 | window_size=window_size, 76 | step_size=step_size 77 | ) 78 | 79 | window_energy = (energy(w) / max_energy for w in tqdm( 80 | signal_windows, 81 | total=int(len(samples) / float(step_size)) 82 | )) 83 | 84 | window_silence = (e > silence_threshold for e in window_energy) 85 | 86 | cut_times = (r * step_duration for r in rising_edges(window_silence)) 87 | 88 | # This is the step that takes long, since we force the generators to run. 89 | print("Finding silences...") 90 | cut_samples = [int(t * sample_rate) for t in cut_times] 91 | cut_samples.append(-1) 92 | 93 | cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in xrange(len(cut_samples) - 1)] 94 | 95 | for i, start, stop in tqdm(cut_ranges): 96 | output_file_path = "{}_{:03d}.wav".format( 97 | os.path.join(output_dir, output_filename_prefix), 98 | i 99 | ) 100 | if not dry_run: 101 | print("Writing file ", output_file_path) 102 | wavfile.write( 103 | filename=output_file_path, 104 | rate=sample_rate, 105 | data=samples[start:stop] 106 | ) 107 | -------------------------------------------------------------------------------- /helpers/convert_raw_to_processed.py: -------------------------------------------------------------------------------- 1 | # from file_manager import FileManager 2 | 3 | # root = '/home/enigmaeth/Videos/accentPhase2/raw' 4 | # accepted_formats = ['wav', 'mp3'] 5 | # FM = FileManager(root, accepted_formats) 6 | 7 | # all_files = FM.get_all_files() 8 | 9 | # for file in all_files: 10 | # language = file.split(' ')[0].split('/')[-1] 11 | 12 | # Import the AudioSegment class for processing audio and the 13 | # split_on_silence function for separating out silent chunks. 14 | from pydub import AudioSegment 15 | from pydub.silence import split_on_silence 16 | 17 | # Define a function to normalize a chunk to a target amplitude.
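# (dBFS is decibels relative to full scale, so the gain applied below is simply the difference between the target level and the chunk's current loudness.)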
18 | def match_target_amplitude(aChunk, target_dBFS): 19 | ''' Normalize given audio chunk ''' 20 | change_in_dBFS = target_dBFS - aChunk.dBFS 21 | return aChunk.apply_gain(change_in_dBFS) 22 | 23 | # Load your audio. 24 | song = AudioSegment.from_file("/home/enigmaeth/Videos/accentPhase2/raw/sb.wav", format="wav") 25 | 26 | # Split track where the silence is 2 seconds or more and get chunks using 27 | # the imported function. 28 | chunks = split_on_silence ( 29 | # Use the loaded audio. 30 | song, 31 | # Specify that a silent chunk must be at least 2 seconds or 2000 ms long. 32 | min_silence_len = 2000, 33 | # Consider a chunk silent if it's quieter than -16 dBFS. 34 | # (You may want to adjust this parameter.) 35 | silence_thresh = -16 36 | ) 37 | print("split ", len(chunks)) 38 | 39 | # Process each chunk with your parameters 40 | for i, chunk in enumerate(chunks): 41 | # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding. 42 | silence_chunk = AudioSegment.silent(duration=500) 43 | 44 | # Add the padding chunk to beginning and end of the entire chunk. 45 | audio_chunk = silence_chunk + chunk + silence_chunk 46 | 47 | # Normalize the entire chunk. 48 | normalized_chunk = match_target_amplitude(audio_chunk, -20.0) 49 | 50 | # Export the audio chunk with new bitrate. 51 | print("Exporting chunk{0}.mp3".format(i)) 52 | normalized_chunk.export( 53 | ".//chunk{0}.mp3".format(i), 54 | bitrate = "192k", 55 | format = "mp3" 56 | ) -------------------------------------------------------------------------------- /helpers/file_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class FileManager: 4 | """ 5 | This class provides functions related to file management required for the indexer 6 | """ 7 | 8 | def __init__(self, root, accepted_formats): 9 | """ 10 | initialize variables: root path and accepted formats for the indexer 11 | """ 12 | self.root = root 13 | self.accepted_formats = accepted_formats 14 | 15 | 16 | def get_all_files(self): 17 | """ 18 | List all files recursively under the root directory 19 | """ 20 | files_list = [] 21 | for path, subdirs, files in os.walk(self.root): 22 | for name in files: 23 | files_list.append(os.path.join(path, name)) 24 | return files_list 25 | 26 | 27 | def get_files_to_be_processed(self): 28 | """ 29 | returns list of files to be included in the index 30 | set `root` variable to the desired root 31 | :return: list of files to be processed 32 | """ 33 | files = self.get_all_files() 34 | files_list = [] 35 | for name in files: 36 | if(name.split('.')[-1] in self.accepted_formats and os.stat(name).st_size < 5000000): 37 | files_list.append(name) 38 | return files_list[0:-1] -------------------------------------------------------------------------------- /helpers/run.py: -------------------------------------------------------------------------------- 1 | # Import the AudioSegment class for processing audio and the 2 | # split_on_silence function for separating out silent chunks. 3 | from pydub import AudioSegment 4 | from pydub.silence import split_on_silence 5 | import os 6 | 7 | FORMAT = 'mp3' 8 | 9 | # Define a function to normalize a chunk to a target amplitude. 10 | def match_target_amplitude(aChunk, target_dBFS): 11 | ''' Normalize given audio chunk ''' 12 | change_in_dBFS = target_dBFS - aChunk.dBFS 13 | return aChunk.apply_gain(change_in_dBFS) 14 | 15 | # Load your audio.
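# (AudioSegment.from_file selects the decoder from the `format` argument, so the same call handles wav and mp3 sources alike.)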
16 | song = AudioSegment.from_file("Bangla Jaya REC20190704135235.mp3", format=FORMAT) 17 | 18 | print(song) 19 | 20 | # Split track where the silence is 1 second or more and get chunks using 21 | # the imported function. 22 | chunks = split_on_silence ( 23 | # Use the loaded audio. 24 | song, 25 | # Specify that a silent chunk must be at least 1 second or 1000 ms long. 26 | min_silence_len = 1000, 27 | # Consider a chunk silent if it's quieter than -16 dBFS. 28 | # (You may want to adjust this parameter.) 29 | silence_thresh = -16 30 | ) 31 | 32 | print(chunks) 33 | 34 | # Process each chunk with your parameters 35 | for i, chunk in enumerate(chunks): 36 | # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding. 37 | silence_chunk = AudioSegment.silent(duration=500) 38 | 39 | # Add the padding chunk to beginning and end of the entire chunk. 40 | audio_chunk = silence_chunk + chunk + silence_chunk 41 | 42 | # Normalize the entire chunk. 43 | normalized_chunk = match_target_amplitude(audio_chunk, -20.0) 44 | 45 | # Export the audio chunk with new bitrate. 46 | print("Exporting chunk{0}.{1}".format(i, FORMAT)) 47 | normalized_chunk.export( 48 | ".//chunk{0}.{1}".format(i, FORMAT), 49 | bitrate = "192k", 50 | format = FORMAT 51 | ) 52 | -------------------------------------------------------------------------------- /helpers/split_to_wav.sh.save: -------------------------------------------------------------------------------- 1 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 2 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 3 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 4 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 5 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 6 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 7 | python alt_split.py /home/enigmaeth/accentPhase2/data/all_accents/Bangla_Jay.wav --output-dir out/ --min-silence-length=2 --silence-threshold=0.01 8 | -------------------------------------------------------------------------------- /helpers/split_wav.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from scipy.io import wavfile 4 | import os 5 | import numpy as np 6 | import argparse 7 | from tqdm import tqdm 8 | 9 | # Utility functions 10 | xrange = range 11 | def windows(signal, window_size, step_size): 12 | if type(window_size) is not int: 13 | raise AttributeError("Window size must be an integer.") 14 | if type(step_size) is not int: 15 | raise AttributeError("Step size must be an integer.") 16 | for i_start in xrange(0, len(signal), step_size): 17 | i_end = i_start + window_size 18 | if i_end >= len(signal): 19 | break 20 | yield signal[i_start:i_end] 21 | 22 | def energy(samples): 23 | return np.sum(np.power(samples, 2.)) / float(len(samples)) 24 | 25 | def rising_edges(binary_signal): 26 | previous_value = 0 27 | index = 0 28 | for x in binary_signal: 29 | if x and not previous_value:
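            # a 0 -> 1 transition marks the first window whose energy rose above the threshold, i.e. the start of a non-silent region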
30 | yield index 31 | previous_value = x 32 | index += 1 33 | 34 | # Process command line arguments 35 | 36 | parser = argparse.ArgumentParser(description='Split a WAV file at silence.') 37 | parser.add_argument('--input_file', type=str, required=True, help='The WAV file to split.') 38 | parser.add_argument('--output-dir', '-o', type=str, default='.', help='The output folder. Defaults to the current folder.') 39 | parser.add_argument('--min-silence-length', '-m', type=float, default=3., help='The minimum length of silence at which a split may occur [seconds]. Defaults to 3 seconds.') 40 | parser.add_argument('--silence-threshold', '-t', type=float, default=1e-6, help='The energy level (between 0.0 and 1.0) below which the signal is regarded as silent. Defaults to 1e-6 == 0.0001%.') 41 | parser.add_argument('--step-duration', '-s', type=float, default=None, help='The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Defaults to (min-silence-length / 10.).') 42 | parser.add_argument('--dry-run', '-n', action='store_true', help='Don\'t actually write any output files.') 43 | 44 | args = parser.parse_args() 45 | 46 | input_filename = args.input_file 47 | window_duration = args.min_silence_length 48 | if args.step_duration is None: 49 | step_duration = window_duration / 10. 50 | else: 51 | step_duration = args.step_duration 52 | silence_threshold = args.silence_threshold 53 | output_dir = args.output_dir 54 | output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0] 55 | dry_run = args.dry_run 56 | 57 | # print("Splitting {} where energy is below {}% for longer than {}s.".format( 58 | # input_filename, 59 | # silence_threshold * 100., 60 | # window_duration 61 | # )) 62 | 63 | # Read and split the file 64 | 65 | sample_rate, samples = wavfile.read(filename=input_filename, mmap=True) 66 | 67 | max_amplitude = np.iinfo(samples.dtype).max 68 | max_energy = energy([max_amplitude]) 69 | 70 | window_size = int(window_duration * sample_rate) 71 | step_size = int(step_duration * sample_rate) 72 | 73 | signal_windows = windows( 74 | signal=samples, 75 | window_size=window_size, 76 | step_size=step_size 77 | ) 78 | 79 | window_energy = (energy(w) / max_energy for w in tqdm( 80 | signal_windows, 81 | total=int(len(samples) / float(step_size)) 82 | )) 83 | 84 | window_silence = (e > silence_threshold for e in window_energy) 85 | 86 | cut_times = (r * step_duration for r in rising_edges(window_silence)) 87 | 88 | # This is the step that takes long, since we force the generators to run.
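# (window_energy, window_silence and cut_times above are lazy generator expressions; no audio is actually scanned until the list comprehension below pulls values through the whole chain.)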
89 | print("Finding silences...") 90 | cut_samples = [int(t * sample_rate) for t in cut_times] 91 | cut_samples.append(-1) 92 | 93 | cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in xrange(len(cut_samples) - 1)] 94 | 95 | for i, start, stop in tqdm(cut_ranges): 96 | output_file_path = "{}_{:03d}.wav".format( 97 | os.path.join(output_dir, output_filename_prefix), 98 | i 99 | ) 100 | if not dry_run: 101 | print("Writing file ", output_file_path) 102 | wavfile.write( 103 | filename=output_file_path, 104 | rate=sample_rate, 105 | data=samples[start:stop] 106 | ) 107 | else: 108 | print("Not Writing file ", output_file_path) 109 | -------------------------------------------------------------------------------- /ipynb-htmls/conv1d (1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "conv1d.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "OTCB8AL-osBL", 21 | "colab_type": "code", 22 | "colab": { 23 | "base_uri": "https://localhost:8080/", 24 | "height": 35 25 | }, 26 | "outputId": "42e4be27-f0b0-4d9d-8353-c1b410aeca82" 27 | }, 28 | "source": [ 29 | "from google.colab import drive\n", 30 | "drive.mount('/content/drive')" 31 | ], 32 | "execution_count": 1, 33 | "outputs": [ 34 | { 35 | "output_type": "stream", 36 | "text": [ 37 | "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" 38 | ], 39 | "name": "stdout" 40 | } 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "koL6wrhIq_em", 47 | "colab_type": "code", 48 | "colab": { 49 | "base_uri": "https://localhost:8080/", 50 | "height": 138 51 | }, 52 | "outputId": "1d690b45-0ba9-42de-82bc-476a9b55bccb" 53 | }, 54 | "source": [ 55 | "\n", 56 | "from __future__ import print_function\n", 57 | "import numpy as np\n", 58 | "from sklearn.model_selection import train_test_split\n", 59 | "from sklearn.metrics import classification_report\n", 60 | "from time import time\n", 61 | "#np.random.seed(1337) # for reproducibility\n", 62 | "\n", 63 | "from keras.preprocessing import sequence\n", 64 | "from keras.models import Sequential\n", 65 | "from keras.layers.core import Dense, Dropout, Activation, Flatten\n", 66 | "from keras.layers.normalization import BatchNormalization\n", 67 | "from keras.layers.convolutional import Convolution1D, MaxPooling1D\n", 68 | "from keras.utils import np_utils\n", 69 | "from keras.callbacks import TensorBoard\n", 70 | "\n", 71 | "\n", 72 | "# set parameters:\n", 73 | "test_dim = 499\n", 74 | "maxlen = 100\n", 75 | "nb_filter = 256\n", 76 | "filter_length_1 = 10\n", 77 | "filter_length_2 = 5\n", 78 | "hidden_dims = 750\n", 79 | "nb_epoch = 12\n", 80 | "nb_classes = 2\n", 81 | "split_ratio = 0.15\n", 82 | "\n", 83 | "print('Loading data...')\n", 84 | "\n", 85 | "# X = np.load('/content/drive/My Drive/Colab Notebooks/data/numpy_vectors/x_test_mfcc_500_50:50_samples_sliced_out.npy')\n", 86 | "# y = np.load('/content/drive/My Drive/Colab Notebooks/data/numpy_vectors/y_label_500_50:50_samples_sliced_out.npy')\n", 87 | "X = np.load('/content/drive/My Drive/Colab Notebooks/data/numpy_vectors/x_3:1_samples_out.npy')\n", 88 | "y = np.load('/content/drive/My Drive/Colab 
Notebooks/data/numpy_vectors/y_3:1_samples_out.npy')\n", 89 | "print(X.shape)\n", 90 | "print(y.shape)" 91 | ], 92 | "execution_count": 2, 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "text": [ 97 | "Using TensorFlow backend.\n" 98 | ], 99 | "name": "stderr" 100 | }, 101 | { 102 | "output_type": "display_data", 103 | "data": { 104 | "text/html": [ 105 | "
\n", 106 | "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.\n", 107 | "We recommend you upgrade now \n", 108 | "or ensure your notebook will continue to use TensorFlow 1.x via the %tensorflow_version 1.x magic:\n", 109 | "more info.
\n" 110 | ], 111 | "text/plain": [ 112 | "" 113 | ] 114 | }, 115 | "metadata": { 116 | "tags": [] 117 | } 118 | }, 119 | { 120 | "output_type": "stream", 121 | "text": [ 122 | "Loading data...\n", 123 | "(3155, 499, 13)\n", 124 | "(3155,)\n" 125 | ], 126 | "name": "stdout" 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "D9lzWC2zrkch", 134 | "colab_type": "code", 135 | "colab": {} 136 | }, 137 | "source": [ 138 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)\n", 139 | "Y_train = y_train\n", 140 | "Y_test = y_test" 141 | ], 142 | "execution_count": 0, 143 | "outputs": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "metadata": { 148 | "id": "Zz0tpQ_kiQNo", 149 | "colab_type": "code", 150 | "colab": { 151 | "base_uri": "https://localhost:8080/", 152 | "height": 1141 153 | }, 154 | "outputId": "e3bec454-7207-4661-ae41-c37f0d82f754" 155 | }, 156 | "source": [ 157 | "import keras\n", 158 | "from keras.preprocessing.image import ImageDataGenerator\n", 159 | "from keras.models import Sequential\n", 160 | "from keras.layers import Conv1D, MaxPooling1D\n", 161 | "from keras.layers import Activation, Dropout, Flatten, Dense\n", 162 | "nb_train_samples = X.shape\n", 163 | "input_shape = (test_dim, 13)\n", 164 | "for batch_size in range(25, 26, 5):\n", 165 | " print('Build model...')\n", 166 | " model = Sequential()\n", 167 | "\n", 168 | " model = Sequential()\n", 169 | " model.add(Conv1D(32, (3), input_shape=input_shape))\n", 170 | " model.add(Activation('relu'))\n", 171 | " model.add(MaxPooling1D(pool_size=(2)))\n", 172 | "\n", 173 | " model.add(Conv1D(32, (3)))\n", 174 | " model.add(Activation('relu'))\n", 175 | " model.add(MaxPooling1D(pool_size=(2)))\n", 176 | "\n", 177 | " model.add(Conv1D(64, (3)))\n", 178 | " model.add(Activation('relu'))\n", 179 | " model.add(MaxPooling1D(pool_size=(2)))\n", 180 | "\n", 181 | " model.add(Flatten())\n", 182 | " model.add(Dense(64))\n", 183 | " model.add(Activation('relu'))\n", 184 | " model.add(Dropout(0.5))\n", 185 | " model.add(Dense(1))\n", 186 | " model.add(Activation('sigmoid'))\n", 187 | "\n", 188 | " model.compile(loss='binary_crossentropy',\n", 189 | " optimizer='rmsprop',\n", 190 | " metrics=['accuracy'])\n", 191 | " \n", 192 | " model.fit(X_train, Y_train, steps_per_epoch=nb_train_samples[0] // batch_size,\n", 193 | " nb_epoch=10, shuffle='true', verbose=1)\n", 194 | "\n", 195 | " Y_preds = model.predict(X_test)\n", 196 | " # for i in range(len(Y_preds)):\n", 197 | " # print(Y_preds[i], Y_test[i])\n", 198 | " score = model.evaluate(X_test, Y_test, verbose=1)\n", 199 | " print(score)" 200 | ], 201 | "execution_count": 4, 202 | "outputs": [ 203 | { 204 | "output_type": "stream", 205 | "text": [ 206 | "Build model...\n", 207 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", 208 | "\n", 209 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", 210 | "\n", 211 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. 
Please use tf.random.uniform instead.\n", 212 | "\n", 213 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4267: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", 214 | "\n", 215 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:148: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.\n", 216 | "\n", 217 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3733: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 218 | "Instructions for updating:\n", 219 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 220 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", 221 | "\n", 222 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3657: The name tf.log is deprecated. Please use tf.math.log instead.\n", 223 | "\n", 224 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", 225 | "Instructions for updating:\n", 226 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" 227 | ], 228 | "name": "stdout" 229 | }, 230 | { 231 | "output_type": "stream", 232 | "text": [ 233 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:37: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n" 234 | ], 235 | "name": "stderr" 236 | }, 237 | { 238 | "output_type": "stream", 239 | "text": [ 240 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n", 241 | "\n", 242 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1020: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.\n", 243 | "\n", 244 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3005: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n", 245 | "\n", 246 | "Epoch 1/10\n", 247 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.\n", 248 | "\n", 249 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n", 250 | "\n", 251 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:207: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", 252 | "\n", 253 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:216: The name tf.is_variable_initialized is deprecated. 
Please use tf.compat.v1.is_variable_initialized instead.\n", 254 | "\n", 255 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:223: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.\n", 256 | "\n", 257 | "126/126 [==============================] - 20s 160ms/step - loss: 0.7433 - acc: 0.9211\n", 258 | "Epoch 2/10\n", 259 | "126/126 [==============================] - 18s 142ms/step - loss: 0.0579 - acc: 0.9902\n", 260 | "Epoch 3/10\n", 261 | "126/126 [==============================] - 18s 142ms/step - loss: 0.0239 - acc: 0.9971\n", 262 | "Epoch 4/10\n", 263 | "126/126 [==============================] - 18s 142ms/step - loss: 0.0182 - acc: 0.9978\n", 264 | "Epoch 5/10\n", 265 | "126/126 [==============================] - 18s 142ms/step - loss: 0.0271 - acc: 0.9969\n", 266 | "Epoch 6/10\n", 267 | "126/126 [==============================] - 18s 141ms/step - loss: 0.0440 - acc: 0.9947\n", 268 | "Epoch 7/10\n", 269 | "126/126 [==============================] - 18s 141ms/step - loss: 2.6129e-04 - acc: 0.9999\n", 270 | "Epoch 8/10\n", 271 | "126/126 [==============================] - 18s 141ms/step - loss: 0.0508 - acc: 0.9953\n", 272 | "Epoch 9/10\n", 273 | "126/126 [==============================] - 18s 141ms/step - loss: 0.0255 - acc: 0.9980\n", 274 | "Epoch 10/10\n", 275 | "126/126 [==============================] - 18s 141ms/step - loss: 0.0212 - acc: 0.9983\n", 276 | "474/474 [==============================] - 0s 247us/step\n", 277 | "[1.2402050405118628e-07, 1.0]\n" 278 | ], 279 | "name": "stdout" 280 | } 281 | ] 282 | } 283 | ] 284 | } -------------------------------------------------------------------------------- /ipynb-htmls/conv1d.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "conv1d.ipynb", 7 | "provenance": [], 8 | "private_outputs": true, 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "OTCB8AL-osBL", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from google.colab import drive\n", 27 | "drive.mount('/content/drive')" 28 | ], 29 | "execution_count": 0, 30 | "outputs": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "metadata": { 35 | "id": "koL6wrhIq_em", 36 | "colab_type": "code", 37 | "colab": {} 38 | }, 39 | "source": [ 40 | "\n", 41 | "from __future__ import print_function\n", 42 | "import numpy as np\n", 43 | "from sklearn.model_selection import train_test_split\n", 44 | "from sklearn.metrics import classification_report\n", 45 | "from time import time\n", 46 | "#np.random.seed(1337) # for reproducibility\n", 47 | "\n", 48 | "from keras.preprocessing import sequence\n", 49 | "from keras.models import Sequential\n", 50 | "from keras.layers.core import Dense, Dropout, Activation, Flatten\n", 51 | "from keras.layers.normalization import BatchNormalization\n", 52 | "from keras.layers.convolutional import Convolution1D, MaxPooling1D\n", 53 | "from keras.utils import np_utils\n", 54 | "from keras.callbacks import TensorBoard\n", 55 | "\n", 56 | "\n", 57 | "# set parameters:\n", 58 | "test_dim = 499\n", 59 | "maxlen = 100\n", 60 | "nb_filter = 512\n", 61 | "filter_length_1 = 10\n", 62 | "filter_length_2 = 5\n", 63 | "hidden_dims = 
750\n", 64 | "nb_epoch = 20\n", 65 | "nb_classes = 2\n", 66 | "split_ratio = 0.15\n", 67 | "\n", 68 | "print('Loading data...')\n", 69 | "\n", 70 | "X = np.load('/content/drive/My Drive/Colab Notebooks/data/numpy_vectors/x_test_mfcc_500_50:50_samples_sliced_out.npy')\n", 71 | "y = np.load('/content/drive/My Drive/Colab Notebooks/data/numpy_vectors/y_label_500_50:50_samples_sliced_out.npy')\n", 72 | "print(X.shape)\n", 73 | "print(y.shape)" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "D9lzWC2zrkch", 82 | "colab_type": "code", 83 | "colab": {} 84 | }, 85 | "source": [ 86 | "\n", 87 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)\n", 88 | "\n", 89 | "xts = X_train.shape\n", 90 | "#X_train = np.reshape(X_train, (xts[0], xts[1], 1))\n", 91 | "xtss = X_test.shape\n", 92 | "#X_test = np.reshape(X_test, (xtss[0], xtss[1], 1))\n", 93 | "yts = y_train.shape\n", 94 | "#y_train = np.reshape(y_train, (yts[0], 1))\n", 95 | "ytss = y_test.shape\n", 96 | "#y_test = np.reshape(y_test, (ytss[0], 1))\n", 97 | "\n", 98 | "print(len(X_train), 'train sequences')\n", 99 | "print(len(X_test), 'test sequences')\n", 100 | "\n", 101 | "Y_train = np_utils.to_categorical(y_train, nb_classes)\n", 102 | "Y_test = np_utils.to_categorical(y_test, nb_classes)\n", 103 | "\n", 104 | "# print('Pad sequences (samples x time)')\n", 105 | "# X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", 106 | "# X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n", 107 | "# print('X_train shape:', X_train.shape)\n", 108 | "# print('X_test shape:', X_test.shape)\n" 109 | ], 110 | "execution_count": 0, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "9yKzDEgVroJf", 117 | "colab_type": "code", 118 | "colab": {} 119 | }, 120 | "source": [ 121 | "\n", 122 | "for batch_size in range(10, 11, 5):\n", 123 | " print('Build model...')\n", 124 | " model = Sequential()\n", 125 | "\n", 126 | " # we start off with an efficient embedding layer which maps\n", 127 | " # our vocab indices into embedding_dims dimensions\n", 128 | " # model.add(Embedding(max_features, embedding_dims, input_length=maxlen))\n", 129 | " # model.add(Dropout(0.25))\n", 130 | "\n", 131 | " # we add a Convolution1D, which will learn nb_filter\n", 132 | " # word group filters of size filter_length:\n", 133 | " model.add(Convolution1D(nb_filter=nb_filter,\n", 134 | " filter_length=filter_length_1,\n", 135 | " input_shape=(test_dim, 13),\n", 136 | " border_mode='valid',\n", 137 | " activation='relu'\n", 138 | " ))\n", 139 | " # we use standard max pooling (halving the output of the previous layer):\n", 140 | " model.add(BatchNormalization())\n", 141 | "\n", 142 | " model.add(Convolution1D(nb_filter=nb_filter,\n", 143 | " filter_length=5,\n", 144 | " border_mode='valid',\n", 145 | " activation='relu'\n", 146 | " ))\n", 147 | "\n", 148 | " model.add(BatchNormalization())\n", 149 | "\n", 150 | " model.add(MaxPooling1D(pool_length=2))\n", 151 | "\n", 152 | " model.add(Convolution1D(nb_filter=nb_filter,\n", 153 | " filter_length=25,\n", 154 | " border_mode='same',\n", 155 | " activation='relu'\n", 156 | " ))\n", 157 | "\n", 158 | " model.add(BatchNormalization())\n", 159 | "\n", 160 | " model.add(MaxPooling1D(pool_length=2))\n", 161 | "\n", 162 | " model.add(Convolution1D(nb_filter=nb_filter,\n", 163 | " filter_length=50,\n", 164 | " border_mode='same',\n", 165 | " activation='relu'\n", 166 | " ))\n", 167 | "\n", 
168 | " model.add(BatchNormalization())\n", 169 | "\n", 170 | " model.add(MaxPooling1D(pool_length=2))\n", 171 | "\n", 172 | " model.add(Convolution1D(nb_filter=nb_filter,\n", 173 | " filter_length=2,\n", 174 | " border_mode='same',\n", 175 | " activation='relu'\n", 176 | " ))\n", 177 | "\n", 178 | " model.add(BatchNormalization())\n", 179 | "\n", 180 | " model.add(MaxPooling1D(pool_length=2))\n", 181 | "\n", 182 | " # We flatten the output of the conv layer,\n", 183 | " # so that we can add a vanilla dense layer:\n", 184 | " model.add(Flatten())\n", 185 | "\n", 186 | " # We add a vanilla hidden layer:\n", 187 | " # model.add(Dense(hidden_dims))\n", 188 | " model.add(Dropout(0.25))\n", 189 | " # model.add(Activation('relu'))\n", 190 | "\n", 191 | " model.add(Dense(1000))\n", 192 | " model.add(Activation('relu'))\n", 193 | " model.add(Dense(750))\n", 194 | " model.add(Activation('relu'))\n", 195 | " model.add(Dense(50))\n", 196 | " model.add(Activation('relu'))\n", 197 | " # We project onto a single unit output layer, and squash it with a sigmoid:\n", 198 | " model.add(Dense(nb_classes))\n", 199 | " model.add(Activation('softmax'))\n", 200 | "\n", 201 | " model.compile(loss='binary_crossentropy',\n", 202 | " optimizer='adam', metrics=['accuracy'])\n", 203 | "\n", 204 | " print(\"model/split = {} <> batchsize = {}\".format(split_ratio, batch_size))\n", 205 | " tensorboard = TensorBoard(log_dir=\"logs/split_{}_batchsize_{}\".format(split_ratio, batch_size))\n", 206 | "\n", 207 | " model.fit(X_train, Y_train, batch_size=batch_size,\n", 208 | " nb_epoch=nb_epoch, verbose=1, callbacks=[tensorboard]\t)\n", 209 | "\n", 210 | " # model.save('model_hin_tel_38_samples.h5')\n", 211 | "\n", 212 | " y_preds = model.predict(X_test)\n", 213 | " for i in range(len(y_preds)):\n", 214 | " print(y_preds[i], y_test[i])\n", 215 | " \n", 216 | " score = model.evaluate(X_test, Y_test, verbose=1)\n", 217 | " print(score)\n", 218 | " print(\"\\n**********************************\\n\")\n", 219 | "\n", 220 | "# print(classification_report(Y_test, Y_preds))" 221 | ], 222 | "execution_count": 0, 223 | "outputs": [] 224 | } 225 | ] 226 | } -------------------------------------------------------------------------------- /notebooks/pase.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 75, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Current Model keys: 69\n", 13 | "Current Pt keys: 69\n", 14 | "Loading matching keys: ['blocks.0.conv.low_hz_', 'blocks.0.conv.band_hz_', 'blocks.0.norm.weight', 'blocks.0.norm.bias', 'blocks.0.norm.running_mean', 'blocks.0.norm.running_var', 'blocks.0.norm.num_batches_tracked', 'blocks.0.act.weight', 'blocks.1.conv.weight', 'blocks.1.conv.bias', 'blocks.1.norm.weight', 'blocks.1.norm.bias', 'blocks.1.norm.running_mean', 'blocks.1.norm.running_var', 'blocks.1.norm.num_batches_tracked', 'blocks.1.act.weight', 'blocks.2.conv.weight', 'blocks.2.conv.bias', 'blocks.2.norm.weight', 'blocks.2.norm.bias', 'blocks.2.norm.running_mean', 'blocks.2.norm.running_var', 'blocks.2.norm.num_batches_tracked', 'blocks.2.act.weight', 'blocks.3.conv.weight', 'blocks.3.conv.bias', 'blocks.3.norm.weight', 'blocks.3.norm.bias', 'blocks.3.norm.running_mean', 'blocks.3.norm.running_var', 'blocks.3.norm.num_batches_tracked', 'blocks.3.act.weight', 'blocks.4.conv.weight', 'blocks.4.conv.bias', 'blocks.4.norm.weight', 'blocks.4.norm.bias', 
'blocks.4.norm.running_mean', 'blocks.4.norm.running_var', 'blocks.4.norm.num_batches_tracked', 'blocks.4.act.weight', 'blocks.5.conv.weight', 'blocks.5.conv.bias', 'blocks.5.norm.weight', 'blocks.5.norm.bias', 'blocks.5.norm.running_mean', 'blocks.5.norm.running_var', 'blocks.5.norm.num_batches_tracked', 'blocks.5.act.weight', 'blocks.6.conv.weight', 'blocks.6.conv.bias', 'blocks.6.norm.weight', 'blocks.6.norm.bias', 'blocks.6.norm.running_mean', 'blocks.6.norm.running_var', 'blocks.6.norm.num_batches_tracked', 'blocks.6.act.weight', 'blocks.7.conv.weight', 'blocks.7.conv.bias', 'blocks.7.norm.weight', 'blocks.7.norm.bias', 'blocks.7.norm.running_mean', 'blocks.7.norm.running_var', 'blocks.7.norm.num_batches_tracked', 'blocks.7.act.weight', 'W.weight', 'W.bias', 'norm_out.running_mean', 'norm_out.running_var', 'norm_out.num_batches_tracked']\n", 15 | "torch.Size([1, 1, 100000])\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from pase.models.frontend import wf_builder\n", 21 | "pase = wf_builder('pase/cfg/PASE.cfg')\n", 22 | "pase.eval()\n", 23 | "pase.load_pretrained('pase/PASE.ckpt', load_last=True, verbose=True)\n", 24 | "\n", 25 | "# Now we can forward waveforms as Torch tensors\n", 26 | "import torch\n", 27 | "x = torch.randn(1, 1, 100000)\n", 28 | "# y size will be (1, 100, 625), which are 625 frames of 100 dims each\n", 29 | "print(x.shape)\n", 30 | "y = pase(x)\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 41, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import tempfile\n", 40 | "import os\n", 41 | "import pydub\n", 42 | "import scipy\n", 43 | "import scipy.io.wavfile\n", 44 | "\n", 45 | "\n", 46 | "def read_mp3(file_path, as_float = False):\n", 47 | " \"\"\"\n", 48 | " Read an MP3 File into numpy data.\n", 49 | " :param file_path: String path to a file\n", 50 | " :param as_float: Cast data to float and normalize to [-1, 1]\n", 51 | " :return: Tuple(rate, data), where\n", 52 | " rate is an integer indicating samples/s\n", 53 | " data is an ndarray(n_samples, 2)[int16] if as_float = False\n", 54 | " otherwise ndarray(n_samples, 2)[float] in range [-1, 1]\n", 55 | " \"\"\"\n", 56 | "\n", 57 | " path, ext = os.path.splitext(file_path)\n", 58 | " assert ext=='.wav'\n", 59 | " mp3 = pydub.AudioSegment.from_wav(file_path)\n", 60 | " _, path = tempfile.mkstemp()\n", 61 | " mp3.export(path, format=\"wav\")\n", 62 | " rate, data = scipy.io.wavfile.read(path)\n", 63 | " os.remove(path)\n", 64 | " if as_float:\n", 65 | " data = data/(2**15)\n", 66 | " return rate, data" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 56, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "root = '/home/enigmaeth/Videos/accentPhase2/splits'\n", 76 | "def get_all_files():\n", 77 | " \"\"\"\n", 78 | " List all files recursively in the root specified by root\n", 79 | " \"\"\"\n", 80 | " files_list = []\n", 81 | " dirs = []\n", 82 | " import os\n", 83 | "\n", 84 | " for dirname, dirnames, filenames in os.walk(root):\n", 85 | " for filename in filenames:\n", 86 | " files_list.append(os.path.join(dirname, filename))\n", 87 | "\n", 88 | " # Advanced usage:\n", 89 | " # editing the 'dirnames' list will stop os.walk() from recursing into there.\n", 90 | " if '.git' in dirnames:\n", 91 | " # don't go into any .git directories.\n", 92 | " dirnames.remove('.git')\n", 93 | " \n", 94 | " return files_list\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 57, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | 
"name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "['/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_738.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_273.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_508.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_332.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_629.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_526.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_040.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_631.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_097.wav', '/home/enigmaeth/Videos/accentPhase2/splits/MSl/MSl_581.wav']\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "files = get_all_files()\n", 112 | "files = files[:10]\n", 113 | "print(files)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 78, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "import torch\n", 123 | "def run_pipe(n, as_float):\n", 124 | " \"\"\"\n", 125 | " returns pase vectors for set of files \n", 126 | " \"\"\" \n", 127 | " files = get_all_files()\n", 128 | " files = files[0:n]\n", 129 | " pase_vectors = []\n", 130 | " for file in files:\n", 131 | " file_rate, file_vec = read_mp3(file, as_float=as_float)\n", 132 | " file_vec_tensor = torch.from_numpy(file_vec)\n", 133 | " file_vec_tensor_flat = torch.flatten(file_vec_tensor)\n", 134 | " file_vec_tensor_view = file_vec_tensor_flat.view(1, 1, file_vec_tensor_flat.shape[0])\n", 135 | " print(file_vec_tensor_view.size())\n", 136 | " file_pase = pase(file_vec_tensor_view)\n", 137 | " pase_vectors.append(file_pase)\n", 138 | " \n", 139 | " return pase_vectors " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 79, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "torch.Size([1, 1, 480000])\n" 152 | ] 153 | }, 154 | { 155 | "ename": "RuntimeError", 156 | "evalue": "\"reflection_pad1d\" not implemented for 'Short'", 157 | "output_type": "error", 158 | "traceback": [ 159 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 160 | "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", 161 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_pipe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 162 | "\u001b[0;32m\u001b[0m in \u001b[0;36mrun_pipe\u001b[0;34m(n, as_float)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mfile_vec_tensor_view\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfile_vec_tensor_flat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_vec_tensor_flat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_vec_tensor_view\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mfile_pase\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mpase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_vec_tensor_view\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mpase_vectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_pase\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 163 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 164 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/PASE-0.1.dev0-py3.6.egg/pase/models/frontend.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0mdskips\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0mh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdenseskips\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# denseskips happen til the last but one layer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 165 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in 
\u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 166 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/PASE-0.1.dev0-py3.6.egg/pase/models/modules.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 824\u001b[0m \u001b[0mP\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mpad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpad\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 825\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpad_mode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 826\u001b[0;31m \u001b[0mh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 827\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'norm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 828\u001b[0m \u001b[0mh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mforward_norm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 167 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
546\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 168 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/PASE-0.1.dev0-py3.6.egg/pase/models/modules.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, waveforms)\u001b[0m\n\u001b[1;32m 688\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m x_p = F.pad(x, (self.kernel_size // 2,\n\u001b[0;32m--> 690\u001b[0;31m self.kernel_size // 2), mode=self.pad_mode)\n\u001b[0m\u001b[1;32m 691\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 692\u001b[0m \u001b[0mx_p\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 169 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mpad\u001b[0;34m(input, pad, mode, value)\u001b[0m\n\u001b[1;32m 2740\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpad\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'3D tensors expect 2 values for padding'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2741\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'reflect'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2742\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreflection_pad1d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpad\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2743\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'replicate'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2744\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplication_pad1d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpad\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 170 | "\u001b[0;31mRuntimeError\u001b[0m: \"reflection_pad1d\" not implemented for 'Short'" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "print(run_pipe(10, False))" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | 
"kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.5" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /speech2vec/all_split.sh: -------------------------------------------------------------------------------- 1 | echo "GenX" 2 | # python3 gen_x.py 3 | echo "=======================================" 4 | echo "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 5 | 6 | cd ../classification 7 | echo "=======================================" 8 | echo "cnn_bilstm" 9 | echo "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 10 | python3 cnn_bilstm.py 11 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" 12 | echo "=======================================" 13 | 14 | echo "=======================================" 15 | echo "attention_lstm" 16 | echo "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 17 | python3 attention_lstm.py 18 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" 19 | echo "=======================================" 20 | 21 | echo "=======================================" 22 | echo "conv_1d_model" 23 | echo "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 24 | python3 conv_1d_model.py 25 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" 26 | echo "=======================================" 27 | 28 | echo "End" -------------------------------------------------------------------------------- /speech2vec/gen_x.py: -------------------------------------------------------------------------------- 1 | from mfcc import * 2 | import numpy as np 3 | 4 | folder = '../data/splits' 5 | 6 | x = make_class_array(folder) 7 | print(x.shape) 8 | X_file = '../data/numpy_vectors/x_test_mfcc_' + (folder.split('/'))[-1] 9 | 10 | print("saving labels to ", X_file) 11 | np.save(X_file, x) 12 | 13 | 14 | 15 | # filename = "english1.wav" 16 | 17 | # with open(filename, 'rb') as f: 18 | # print(read_in_audio(f)) 19 | 20 | # cd = make_class_array('/media/enigmaeth/My Passport/Datasets/Accent/clean_data') 21 | # print(cd.shape) 22 | # np.save('top_3_100_split_mfcc.npy', cd) 23 | # mf = make_mean_mfcc_df('/media/enigmaeth/My Passport/Datasets/Accent/sounds_wav') 24 | # print(mf.shape) 25 | # np.save('top_3_100_split_y.npy', mf) 26 | -------------------------------------------------------------------------------- /speech2vec/gen_y.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | def generate_y(folder): 5 | accents = {} 6 | counts = {} 7 | y = [] 8 | index = 0 9 | 10 | for filename in os.listdir(folder): 11 | name = ''.join([i for i in filename if not i.isdigit()]) 12 | name = name.split('_')[0] 13 | if name not in accents: 14 | accents[name] = index 15 | index += 1 16 | counts[name] = 0 17 | 18 | counts[name] += 1 19 | y.append(accents[name]) 20 | 21 | print(counts) 22 | print(accents) 23 | 24 | sorted_counts = sorted(counts, key=counts.get, reverse=True) 25 | for r in sorted_counts: 26 | print(r, counts[r]) 27 | 28 | np_y = np.reshape(np.array(y), (len(y), 1)) 29 | 30 | Y_file = '../data/numpy_vectors/y_label_'+ (folder.split('/'))[-1] 31 | print("saving labels to ", Y_file) 32 | np.save(Y_file, y) 33 | 34 | folder = 
"../data/splits" 35 | generate_y(folder) -------------------------------------------------------------------------------- /speech2vec/mfcc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from python_speech_features import mfcc 4 | from python_speech_features import logfbank 5 | import scipy.io.wavfile as wav 6 | from scipy.io.wavfile import write as wav_write 7 | import librosa 8 | import scipy 9 | from tqdm import tqdm 10 | # import scikits.samplerate 11 | import os 12 | 13 | 14 | ''' 15 | mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True) 16 | ''' 17 | # read in wav file, get out signal (np array) and sampling rate (int) 18 | def read_in_audio(filename): 19 | (rate, sig) = wav.read(filename) 20 | return sig, rate 21 | 22 | 23 | # read in signal, take absolute value and slice seconds 1-3 from beginning 24 | def get_two_secs(filename): 25 | sig, rate = read_in_audio(filename) 26 | abs_sig = np.abs(sig) 27 | two_secs = abs_sig[rate:3*rate] 28 | return two_secs 29 | 30 | # calculates moving average for a specified window (number of samples) 31 | def take_moving_average(sig, window_width): 32 | cumsum_vec = np.cumsum(np.insert(sig, 0, 0)) 33 | ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width])/float(window_width) 34 | return ma_vec 35 | 36 | # read in signal, change sample rate to outrate (samples/sec), use write_wav=True to save wav file to disk 37 | def downsample(filename, outrate=8000, write_wav = False): 38 | print(filename) 39 | (rate, sig) = wav.read(filename) 40 | down_sig = librosa.core.resample(sig * 1., rate, outrate, scale=True) 41 | if not write_wav: 42 | return down_sig, outrate 43 | if write_wav: 44 | wav_write('{}_down_{}.wav'.format(filename, outrate), outrate, down_sig) 45 | 46 | def librosa_downsample(filename, outrate=8000): 47 | y, s = librosa.load(filename, sr=8000) 48 | return y, s 49 | 50 | def custom_downsample(filename, outrate=8000): 51 | (rate, sig) = wav.read(filename) 52 | len_in_secs = len(sig) 53 | secs = len_in_secs/rate # Number of seconds in signal X 54 | samps = secs*outrate # Number of samples to downsample 55 | print(secs, samps) 56 | Y = scipy.signal.resample(sig , int(samps)) 57 | return Y, outrate 58 | 59 | # change total number of samps for downsampled file to n_samps by trimming or zero-padding and standardize them 60 | def make_standard_length(filename, n_samps=240000): 61 | down_sig, rate = librosa_downsample(filename) 62 | normed_sig = librosa.util.fix_length(down_sig, n_samps) 63 | normed_sig = (normed_sig - np.mean(normed_sig))/np.std(normed_sig) 64 | return normed_sig 65 | 66 | # from a folder containing wav files, normalize each, divide into num_splits-1 chunks and write the resulting np.arrays to a single matrix 67 | def make_split_audio_array(folder, num_splits = 5): 68 | """ 69 | returns numpy array of split audio for a folder 70 | """ 71 | lst = [] 72 | for filename in tqdm(os.listdir(folder)) : 73 | if filename.endswith('wav'): 74 | normed_sig = make_standard_length(filename) 75 | chunk = normed_sig.shape[0]/num_splits 76 | for i in range(num_splits - 1): 77 | lst.append(normed_sig[i*chunk:(i+2)*chunk]) 78 | lst = np.array(lst) 79 | lst = lst.reshape(lst.shape[0], -1) 80 | return lst 81 | 82 | # for input wav file outputs (13, 2999) mfcc np array 83 | def make_normed_mfcc(filename, outrate=8000): 84 | normed_sig = 
-------------------------------------------------------------------------------- /speech2vec/mp3_getter.py: --------------------------------------------------------------------------------
1 | import urllib.request  # note: the (commented) urlretrieve call needs urllib.request, not bare urllib
2 | import time
3 | import shutil
4 | from requests import get
5 | from bs4 import BeautifulSoup
6 | import pandas as pd
7 | import numpy as np
8 |
9 |
10 | # from the accent.gmu website, pass in list of languages to scrape mp3 files and save them to disk
11 | def mp3getter(lst):
12 |     links = []
13 |     for j in range(len(lst)):
14 |         for i in range(1,lst[j][1]+1):
15 |             try:
16 |                 print(" fetching record ", i, " for language ", lst[j][0])
17 |                 # urllib.request.urlretrieve("http://accent.gmu.edu/soundtracks/{0}{1}.mp3".format(lst[j][0], i), '../sounds/{0}{1}.mp3'.format(lst[j][0], i))
18 |                 # print("http://accent.gmu.edu/soundtracks/{0}{1}.mp3".format(lst[j][0], i))
19 |                 with open('links.txt', 'a') as f:
20 |                     f.write("http://accent.gmu.edu/soundtracks/{0}{1}.mp3".format(lst[j][0], i))
21 |                     f.write('\n')
22 |             except:  # back off briefly on a failed request; the failed index is not retried
23 |                 time.sleep(2)
24 |
25 |
26 | # from list of languages, return urls of each language landing page
27 | def lang_pages(lst):
28 |     urls=[]
29 |     for lang in lst:
30 |         urls.append('http://accent.gmu.edu/browse_language.php?function=find&language={}'.format(lang))
31 |     return urls
32 |
33 | #output:
34 | #
35 | # ['http://accent.gmu.edu/browse_language.php?function=find&language=amharic',
36 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=arabic',
37 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=bengali',
38 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=bulgarian',
39 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=cantonese',
40 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=dutch',
41 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=english',
42 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=farsi',
43 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=french',
44 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=german',
45 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=greek',
46 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=hindi',
47 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=italian',
48 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=japanese',
49 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=korean',
50 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=kurdish',
51 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=macedonian',
52 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=mandarin',
53 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=miskito',
54 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=nepali',
55 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=pashto',
56 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=polish',
57 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=portuguese',
58 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=punjabi',
59 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=romanian',
60 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=russian',
61 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=serbian',
62 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=spanish',
63
| # 'http://accent.gmu.edu/browse_language.php?function=find&language=swedish', 64 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=tagalog', 65 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=thai', 66 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=turkish', 67 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=ukrainian', 68 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=urdu', 69 | # 'http://accent.gmu.edu/browse_language.php?function=find&language=vietnamese'] 70 | 71 | # from http://accent.gmu.edu/browse_language.php, return list of languages 72 | def get_languages(): 73 | url = "http://accent.gmu.edu/browse_language.php" 74 | html = get(url) 75 | soup = BeautifulSoup(html.content, 'html.parser') 76 | languages = [] 77 | language_lists = soup.findAll('ul', attrs={'class': 'languagelist'}) 78 | for ul in language_lists: 79 | for li in ul.findAll('li'): 80 | languages.append(li.text) 81 | return languages 82 | 83 | # from list of languages, return list of urls 84 | def get_language_urls(lst): 85 | urls = [] 86 | for language in lst: 87 | urls.append('http://accent.gmu.edu/browse_language.php?function=find&language=' + language) 88 | return urls 89 | 90 | # from language, get the number of speakers of that language 91 | def get_num(language): 92 | url = 'http://accent.gmu.edu/browse_language.php?function=find&language=' + language 93 | try: 94 | html = get(url) 95 | soup = BeautifulSoup(html.content, 'html.parser') 96 | test = soup.find_all('div', attrs={'class': 'content'}) 97 | num = int(test[0].find('h5').text.split()[2]) 98 | except: 99 | num = 0 100 | return num 101 | 102 | # from list of languages, return list of tuples (LANGUAGE, LANGUAGE_NUM_SPEAKERS) for mp3getter, ignoring languages 103 | # with 0 speakers 104 | def get_formatted_languages(languages): 105 | formatted_languages = [] 106 | for language in languages: 107 | print("processing language: ", language) 108 | num = get_num(language) 109 | if num != 0: 110 | formatted_languages.append((language,num)) 111 | return formatted_languages 112 | 113 | # from each language whose url is contained in the above list, save the number of speakers of that language to a list 114 | def get_nums(lst): 115 | nums = [] 116 | for url in lst: 117 | html = get(url) 118 | soup = BeautifulSoup(html.content, 'html.parser') 119 | test = soup.find_all('div', attrs={'class': 'content'}) 120 | nums.append(int(test[0].find('h5').text.split()[2])) 121 | return nums 122 | 123 | def get_speaker_info(start, stop): 124 | ''' 125 | Inputs: two integers, corresponding to min and max speaker id number per language 126 | Outputs: Pandas Dataframe containing speaker filename, birthplace, native_language, age, sex, age_onset of English 127 | ''' 128 | 129 | user_data = [] 130 | for num in range(start,stop): 131 | info = {'speakerid': num, 'filename': 0, 'birthplace':1, 'native_language': 2, 'age':3, 'sex':4, 'age_onset':5} 132 | url = "http://accent.gmu.edu/browse_language.php?function=detail&speakerid={}".format(num) 133 | html = get(url) 134 | soup = BeautifulSoup(html.content, 'html.parser') 135 | body = soup.find_all('div', attrs={'class': 'content'}) 136 | try: 137 | info['filename']=str(body[0].find('h5').text.split()[0]) 138 | bio_bar = soup.find_all('ul', attrs={'class':'bio'}) 139 | info['birthplace'] = str(bio_bar[0].find_all('li')[0].text)[13:-6] 140 | info['native_language'] = str(bio_bar[0].find_all('li')[1].text.split()[2]) 141 | info['age'] 
= float(bio_bar[0].find_all('li')[3].text.split()[2].strip(',')) 142 | info['sex'] = str(bio_bar[0].find_all('li')[3].text.split()[3].strip()) 143 | info['age_onset'] = float(bio_bar[0].find_all('li')[4].text.split()[4].strip()) 144 | user_data.append(info) 145 | except: 146 | info['filename'] = '' 147 | info['birthplace'] = '' 148 | info['native_language'] = '' 149 | info['age'] = '' 150 | info['sex'] = '' 151 | info['age_onset'] = '' 152 | user_data.append(info) 153 | df = pd.DataFrame(user_data) 154 | df.to_csv('speaker_info_{}.csv'.format(stop)) 155 | return df 156 | 157 | # copy files from one list of wav files to a specified location 158 | def copy_files(lst, path): 159 | for filename in lst: 160 | shutil.copy2('{}.wav'.format(filename), '{}/{}.wav'.format(path, filename)) 161 | 162 | 163 | if __name__ == '__main__': 164 | # lst = get_languages() 165 | # print(len(lst)) 166 | # lert = get_formatted_languages(lst) 167 | # print(lert) 168 | lert = [('afrikaans', 6), ('agni', 1), ('akan', 13), ('albanian', 11), ('amazigh', 2), ('american sign language', 2), ('amharic', 23), ('anyin', 1), ('arabic', 153), ('armenian', 8), ('azerbaijani', 5), ('azerbaijani, south', 2), ('azeri turk', 2), ('bafang', 2), ('baga', 1), ('bahasa indonesia', 12), ('bai', 1), ('balant', 1), ('balanta ganja', 1), ('bamanankan', 5), ('bambara', 5), ('bamun', 1), ('bangla', 1), ('bari', 2), ('basque', 2), ('bavarian', 2), ('belarusan', 3), ('bengali', 19), ('bosnian', 12), ('bulgarian', 19), ('burmese', 2), ('cameroon creole english', 1), ('cantonese', 31), ('carolinian', 1), ('catalan', 5), ('cebuano', 1), ('chaldean', 2), ('chaldean neo aramaic', 1), ('chamorro', 1), ('chichewa', 1), ('chin, mizo', 1), ('chinese', 167), ('chittagonian', 1), ('croatian', 8), ('danish', 9), ('dari', 8), ('dholuo', 2), ('dinka', 1), ('djola', 1), ('dutch', 50), ('eastern farsi', 2), ('ebira', 1), ('edo', 1), ('english', 618), ('estonian', 17), ('ewe', 3), ('fang', 1), ('fanti', 3), ('faroese', 1), ('farsi', 30), ('fataluku', 1), ('fefe', 1), ('fijian', 3), ('filipino', 2), ('finnish', 15), ('flemish', 5), ('french', 69), ('frisian', 1), ('fulani', 1), ('fulfulde adamawa', 1), ('ga', 6), ('gan', 1), ('ganda', 3), ('garifuna', 2), ('gedeo', 1), ('georgian', 5), ('german', 38), ('greek', 15), ('gujarati', 16), ('gusii', 2), ('hadiyya', 2), ('hainanese', 1), ('haitian creole french', 7), ('hakka', 3), ('hausa', 10), ("hawai'i creole english", 2), ("hawai'ian pidgin", 2), ('hebrew', 9), ('hijazi', 17), ('hiligaynon', 2), ('hindi', 31), ('hindi-urdu', 31), ('hindko', 1), ('hmong', 2), ('hmong daw', 2), ('home sign', 1), ('hungarian', 11), ('ibibio', 3), ('icelandic', 3), ('ife', 1), ('igbo', 3), ('ikalanga', 1), ('ilonggo', 2), ('indonesian', 12), ('irish', 1), ('irish gaelic', 1), ('italian', 37), ('jamaican creole english', 4), ('japanese', 34), ('javanese', 1), ('jola', 1), ('kabyle', 1), ('kalanga', 1), ('kamba', 1), ('kambaata', 3), ('kamtok', 1), ('kannada', 9), ('kanuri', 1), ('kazakh', 4), ('kembata', 3), ('khalkha mongol', 8), ('khasonke', 1), ('khmer', 7), ('kikongo', 2), ('kikuyu', 5), ('kinyarwanda', 1), ('kirghiz', 3), ('kirundi', 1), ('kisii', 2), ('kiswahili', 11), ('klao', 1), ('kongo', 2), ('konkani', 3), ('korean', 90), ('krio', 6), ('kru', 1), ('kurdi', 4), ('kurdish', 10), ('kurmanji', 1), ('kyrgyz', 3), ('lamaholot', 1), ('lamotrekese', 1), ('lao', 3), ('latvian', 3), ('liberian english', 2), ('liberian pidgin english', 2), ('lingala', 1), ('lithuanian', 7), ('luba-kasai', 1), ('luganda', 3), ('luo', 3), 
('luxembourgeois', 1), ('macedonian', 26), ('malagasy', 1), ('malay', 5), ('malayalam', 7), ('maltese', 2), ('mancagne', 1), ('mandarin', 115), ('mandingo', 1), ('mandingue', 1), ('mandinka', 1), ('maninkakan', 1), ('mankanya', 1), ('manual communication', 1), ('marathi', 9), ('mauritian', 3), ('mende', 3), ('miskito', 11), ('mizo', 1), ('moba', 1), ('mongolian', 9), ('montenegrin', 1), ('moore', 1), ('morisyen', 2), ('mortlockese', 1), ('najdi', 26), ('nama', 1), ('nandi', 1), ('naxi', 1), ('ndebele', 1), ('nepali', 14), ('newar', 1), ('newari', 1), ('ngemba', 2), ('nicaragua creole english', 4), ('northern sotho', 1), ('norwegian', 7), ('nuer', 1), ('nyanja', 1), ('omani arabic', 1), ('oriya', 2), ('oromo', 3), ('ossetic', 1), ('pahari', 2), ('panjabi', 12), ('papiamentu', 2), ('pashto', 10), ('patois', 4), ('persian', 27), ('pidgin english', 1), ('pohnpeian', 1), ('polish', 39), ('poonchi', 1), ('portuguese', 60), ('pulaar', 3), ('punjabi', 12), ('quechua', 2), ('romanian', 23), ('rotuman', 2), ('rundi', 1), ('russian', 76), ('rwanda', 1), ("sa'a", 1), ('sardinian', 1), ('sarua', 1), ('satawalese', 2), ('sepedi', 1), ('serbian', 19), ('serer', 1), ('serer sine', 1), ('sesotho', 1), ('setswana', 2), ('shan', 1), ('shilluk', 1), ('shona', 2), ('sicilian', 1), ('sindhi', 1), ('sinhala', 7), ('sinhalese', 7), ('slovak', 6), ('slovenian', 2), ('somali', 7), ('sotho', 1), ('spanish', 206), ('sunda', 1), ('sundanese', 1), ('susu', 1), ('swahili', 11), ('swedish', 22), ('swiss german', 7), ('sylheti', 1), ('synthesized', 4), ('tagalog', 24), ('taishan', 1), ('taiwanese', 9), ('tajiki', 14), ('tamajeq', 2), ('tamazight', 1), ('tamil', 13), ('tatar', 1), ('telugu', 13), ('temne', 1), ('teochew', 1), ('tetum', 1), ('tetun-dili', 1), ('thai', 20), ('tibetan', 4), ('tigre', 1), ('tigrigna', 9), ('tok pisin', 1), ('tshiluba', 1), ('tswana', 2), ('turkish', 38), ('turkmen', 2), ('twi', 9), ('ukrainian', 11), ('urdu', 25), ('uyghur', 5), ('uzbek', 5), ('vietnamese', 27), ('vlaams', 4), ('voro', 3), ('wali', 1), ('woleaian', 1), ('wolof', 6), ('wu', 3), ('xasonga', 1), ('xiang', 4), ('yakut', 1), ('yapese', 1), ('yiddish', 5), ('yoruba', 5), ('yue', 1), ('zulu', 1)] 169 | 170 | mp3getter(lert) --------------------------------------------------------------------------------
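
For completeness, a hedged sketch of how the pieces in mp3_getter.py chain together. download_links is an illustrative helper that is not part of the repository: mp3getter itself only appends soundtrack URLs to links.txt, so something still has to fetch the files.

# Hypothetical driver for mp3_getter.py; download_links is illustrative only.
import urllib.request
from mp3_getter import get_languages, get_formatted_languages, mp3getter

def download_links(links_file='links.txt', out_dir='../sounds'):
    with open(links_file) as f:
        for url in (line.strip() for line in f):
            if url:
                # .../soundtracks/english1.mp3 -> ../sounds/english1.mp3
                urllib.request.urlretrieve(url, '{}/{}'.format(out_dir, url.rsplit('/', 1)[-1]))

languages = get_formatted_languages(get_languages())  # [(language, n_speakers), ...]; one request per language
mp3getter(languages)                                  # writes the soundtrack URLs to links.txt
download_links()                                      # fetch the mp3 files themselves
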