├── .gitignore ├── .gitmodules ├── Pytorch-CoreML-Sound-Classification.xcodeproj ├── project.pbxproj ├── project.xcworkspace │ ├── contents.xcworkspacedata │ └── xcshareddata │ │ └── IDEWorkspaceChecks.plist ├── xcshareddata │ └── xcschemes │ │ └── Pytorch-CoreML-Sound-Classification.xcscheme └── xcuserdata │ └── gerald.xcuserdatad │ ├── xcdebugger │ └── Breakpoints_v2.xcbkptlist │ └── xcschemes │ └── xcschememanagement.plist ├── Pytorch-CoreML-Sound-Classification ├── AppDelegate.swift ├── Assets.xcassets │ ├── AppIcon.appiconset │ │ └── Contents.json │ └── Contents.json ├── Base.lproj │ ├── LaunchScreen.storyboard │ └── Main.storyboard ├── ConvertSpectrogram.swift ├── DrawSpecView.swift ├── Info.plist ├── PANN.mlmodel ├── PANN_labels.json ├── SceneDelegate.swift └── ViewController.swift ├── Pytorch-CoreML-Sound-ClassificationTests ├── Info.plist ├── PANN_out.ring_hello.json ├── Pytorch_CoreML_Sound_ClassificationTests.swift └── ring_hello.wav ├── README.md └── python ├── export.log ├── export.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *~ 4 | xcuserdata 5 | *.pth -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "python/audioset_tagging_cnn"] 2 | path = python/audioset_tagging_cnn 3 | url = https://github.com/qiuqiangkong/audioset_tagging_cnn 4 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 50; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 741B2DA32496E94800020939 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DA22496E94800020939 /* AppDelegate.swift */; }; 11 | 741B2DA52496E94800020939 /* SceneDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DA42496E94800020939 /* SceneDelegate.swift */; }; 12 | 741B2DA72496E94800020939 /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DA62496E94800020939 /* ViewController.swift */; }; 13 | 741B2DAA2496E94800020939 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 741B2DA82496E94800020939 /* Main.storyboard */; }; 14 | 741B2DAC2496E94B00020939 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 741B2DAB2496E94B00020939 /* Assets.xcassets */; }; 15 | 741B2DAF2496E94B00020939 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 741B2DAD2496E94B00020939 /* LaunchScreen.storyboard */; }; 16 | 741B2DBA2496E94B00020939 /* Pytorch_CoreML_Sound_ClassificationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DB92496E94B00020939 /* Pytorch_CoreML_Sound_ClassificationTests.swift */; }; 17 | 741B2DCC2496EC2700020939 /* DrawSpecView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DCB2496EC2700020939 /* DrawSpecView.swift */; }; 18 | 741B2DCE2496ED5100020939 /* ConvertSpectrogram.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DCD2496ED5100020939 /* ConvertSpectrogram.swift */; }; 19 | 741B2DCF2496ED5100020939 /* ConvertSpectrogram.swift in Sources */ = {isa = PBXBuildFile; fileRef = 741B2DCD2496ED5100020939 /* ConvertSpectrogram.swift */; }; 20 | 74C725A024A982D40010AC26 /* 
ring_hello.wav in Resources */ = {isa = PBXBuildFile; fileRef = 74C7259F24A982D40010AC26 /* ring_hello.wav */; }; 21 | 74C725A324A982E20010AC26 /* PANN.mlmodel in Sources */ = {isa = PBXBuildFile; fileRef = 74C725A124A982E20010AC26 /* PANN.mlmodel */; }; 22 | 74C725A424A982E20010AC26 /* PANN.mlmodel in Sources */ = {isa = PBXBuildFile; fileRef = 74C725A124A982E20010AC26 /* PANN.mlmodel */; }; 23 | 74C725A524A982E20010AC26 /* PANN_labels.json in Resources */ = {isa = PBXBuildFile; fileRef = 74C725A224A982E20010AC26 /* PANN_labels.json */; }; 24 | 74C725A624A982E20010AC26 /* PANN_labels.json in Resources */ = {isa = PBXBuildFile; fileRef = 74C725A224A982E20010AC26 /* PANN_labels.json */; }; 25 | 74C725A824A982ED0010AC26 /* PANN_out.ring_hello.json in Resources */ = {isa = PBXBuildFile; fileRef = 74C725A724A982ED0010AC26 /* PANN_out.ring_hello.json */; }; 26 | /* End PBXBuildFile section */ 27 | 28 | /* Begin PBXContainerItemProxy section */ 29 | 741B2DB62496E94B00020939 /* PBXContainerItemProxy */ = { 30 | isa = PBXContainerItemProxy; 31 | containerPortal = 741B2D972496E94800020939 /* Project object */; 32 | proxyType = 1; 33 | remoteGlobalIDString = 741B2D9E2496E94800020939; 34 | remoteInfo = "Pytorch-CoreML-Sound-Classification"; 35 | }; 36 | /* End PBXContainerItemProxy section */ 37 | 38 | /* Begin PBXFileReference section */ 39 | 741B2D9F2496E94800020939 /* Pytorch-CoreML-Sound-Classification.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "Pytorch-CoreML-Sound-Classification.app"; sourceTree = BUILT_PRODUCTS_DIR; }; 40 | 741B2DA22496E94800020939 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; 41 | 741B2DA42496E94800020939 /* SceneDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneDelegate.swift; sourceTree = ""; }; 42 | 741B2DA62496E94800020939 /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = ""; }; 43 | 741B2DA92496E94800020939 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; 44 | 741B2DAB2496E94B00020939 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; 45 | 741B2DAE2496E94B00020939 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; 46 | 741B2DB02496E94B00020939 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 47 | 741B2DB52496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "Pytorch-CoreML-Sound-ClassificationTests.xctest"; sourceTree = BUILT_PRODUCTS_DIR; }; 48 | 741B2DB92496E94B00020939 /* Pytorch_CoreML_Sound_ClassificationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Pytorch_CoreML_Sound_ClassificationTests.swift; sourceTree = ""; }; 49 | 741B2DBB2496E94B00020939 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 50 | 741B2DCB2496EC2700020939 /* DrawSpecView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 
DrawSpecView.swift; sourceTree = ""; }; 51 | 741B2DCD2496ED5100020939 /* ConvertSpectrogram.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ConvertSpectrogram.swift; sourceTree = ""; }; 52 | 74C7259F24A982D40010AC26 /* ring_hello.wav */ = {isa = PBXFileReference; lastKnownFileType = audio.wav; path = ring_hello.wav; sourceTree = ""; }; 53 | 74C725A124A982E20010AC26 /* PANN.mlmodel */ = {isa = PBXFileReference; lastKnownFileType = file.mlmodel; path = PANN.mlmodel; sourceTree = ""; }; 54 | 74C725A224A982E20010AC26 /* PANN_labels.json */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.json; path = PANN_labels.json; sourceTree = ""; }; 55 | 74C725A724A982ED0010AC26 /* PANN_out.ring_hello.json */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.json; path = PANN_out.ring_hello.json; sourceTree = ""; }; 56 | /* End PBXFileReference section */ 57 | 58 | /* Begin PBXFrameworksBuildPhase section */ 59 | 741B2D9C2496E94800020939 /* Frameworks */ = { 60 | isa = PBXFrameworksBuildPhase; 61 | buildActionMask = 2147483647; 62 | files = ( 63 | ); 64 | runOnlyForDeploymentPostprocessing = 0; 65 | }; 66 | 741B2DB22496E94B00020939 /* Frameworks */ = { 67 | isa = PBXFrameworksBuildPhase; 68 | buildActionMask = 2147483647; 69 | files = ( 70 | ); 71 | runOnlyForDeploymentPostprocessing = 0; 72 | }; 73 | /* End PBXFrameworksBuildPhase section */ 74 | 75 | /* Begin PBXGroup section */ 76 | 741B2D962496E94800020939 = { 77 | isa = PBXGroup; 78 | children = ( 79 | 741B2DA12496E94800020939 /* Pytorch-CoreML-Sound-Classification */, 80 | 741B2DB82496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests */, 81 | 741B2DA02496E94800020939 /* Products */, 82 | ); 83 | sourceTree = ""; 84 | }; 85 | 741B2DA02496E94800020939 /* Products */ = { 86 | isa = PBXGroup; 87 | children = ( 88 | 741B2D9F2496E94800020939 /* Pytorch-CoreML-Sound-Classification.app */, 89 | 741B2DB52496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests.xctest */, 90 | ); 91 | name = Products; 92 | sourceTree = ""; 93 | }; 94 | 741B2DA12496E94800020939 /* Pytorch-CoreML-Sound-Classification */ = { 95 | isa = PBXGroup; 96 | children = ( 97 | 741B2DA22496E94800020939 /* AppDelegate.swift */, 98 | 741B2DA42496E94800020939 /* SceneDelegate.swift */, 99 | 741B2DA62496E94800020939 /* ViewController.swift */, 100 | 741B2DCD2496ED5100020939 /* ConvertSpectrogram.swift */, 101 | 741B2DCB2496EC2700020939 /* DrawSpecView.swift */, 102 | 74C725A224A982E20010AC26 /* PANN_labels.json */, 103 | 74C725A124A982E20010AC26 /* PANN.mlmodel */, 104 | 741B2DA82496E94800020939 /* Main.storyboard */, 105 | 741B2DAB2496E94B00020939 /* Assets.xcassets */, 106 | 741B2DAD2496E94B00020939 /* LaunchScreen.storyboard */, 107 | 741B2DB02496E94B00020939 /* Info.plist */, 108 | ); 109 | path = "Pytorch-CoreML-Sound-Classification"; 110 | sourceTree = ""; 111 | }; 112 | 741B2DB82496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests */ = { 113 | isa = PBXGroup; 114 | children = ( 115 | 74C7259F24A982D40010AC26 /* ring_hello.wav */, 116 | 74C725A724A982ED0010AC26 /* PANN_out.ring_hello.json */, 117 | 741B2DB92496E94B00020939 /* Pytorch_CoreML_Sound_ClassificationTests.swift */, 118 | 741B2DBB2496E94B00020939 /* Info.plist */, 119 | ); 120 | path = "Pytorch-CoreML-Sound-ClassificationTests"; 121 | sourceTree = ""; 122 | }; 123 | /* End PBXGroup section */ 124 | 125 | /* Begin PBXNativeTarget section */ 126 | 741B2D9E2496E94800020939 /* Pytorch-CoreML-Sound-Classification */ = { 127 
| isa = PBXNativeTarget; 128 | buildConfigurationList = 741B2DBE2496E94B00020939 /* Build configuration list for PBXNativeTarget "Pytorch-CoreML-Sound-Classification" */; 129 | buildPhases = ( 130 | 741B2D9B2496E94800020939 /* Sources */, 131 | 741B2D9C2496E94800020939 /* Frameworks */, 132 | 741B2D9D2496E94800020939 /* Resources */, 133 | ); 134 | buildRules = ( 135 | ); 136 | dependencies = ( 137 | ); 138 | name = "Pytorch-CoreML-Sound-Classification"; 139 | productName = "Pytorch-CoreML-Sound-Classification"; 140 | productReference = 741B2D9F2496E94800020939 /* Pytorch-CoreML-Sound-Classification.app */; 141 | productType = "com.apple.product-type.application"; 142 | }; 143 | 741B2DB42496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests */ = { 144 | isa = PBXNativeTarget; 145 | buildConfigurationList = 741B2DC12496E94B00020939 /* Build configuration list for PBXNativeTarget "Pytorch-CoreML-Sound-ClassificationTests" */; 146 | buildPhases = ( 147 | 741B2DB12496E94B00020939 /* Sources */, 148 | 741B2DB22496E94B00020939 /* Frameworks */, 149 | 741B2DB32496E94B00020939 /* Resources */, 150 | ); 151 | buildRules = ( 152 | ); 153 | dependencies = ( 154 | 741B2DB72496E94B00020939 /* PBXTargetDependency */, 155 | ); 156 | name = "Pytorch-CoreML-Sound-ClassificationTests"; 157 | productName = "Pytorch-CoreML-Sound-ClassificationTests"; 158 | productReference = 741B2DB52496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests.xctest */; 159 | productType = "com.apple.product-type.bundle.unit-test"; 160 | }; 161 | /* End PBXNativeTarget section */ 162 | 163 | /* Begin PBXProject section */ 164 | 741B2D972496E94800020939 /* Project object */ = { 165 | isa = PBXProject; 166 | attributes = { 167 | LastSwiftUpdateCheck = 1140; 168 | LastUpgradeCheck = 1140; 169 | ORGANIZATIONNAME = Gerald; 170 | TargetAttributes = { 171 | 741B2D9E2496E94800020939 = { 172 | CreatedOnToolsVersion = 11.4.1; 173 | }; 174 | 741B2DB42496E94B00020939 = { 175 | CreatedOnToolsVersion = 11.4.1; 176 | TestTargetID = 741B2D9E2496E94800020939; 177 | }; 178 | }; 179 | }; 180 | buildConfigurationList = 741B2D9A2496E94800020939 /* Build configuration list for PBXProject "Pytorch-CoreML-Sound-Classification" */; 181 | compatibilityVersion = "Xcode 9.3"; 182 | developmentRegion = en; 183 | hasScannedForEncodings = 0; 184 | knownRegions = ( 185 | en, 186 | Base, 187 | ); 188 | mainGroup = 741B2D962496E94800020939; 189 | productRefGroup = 741B2DA02496E94800020939 /* Products */; 190 | projectDirPath = ""; 191 | projectRoot = ""; 192 | targets = ( 193 | 741B2D9E2496E94800020939 /* Pytorch-CoreML-Sound-Classification */, 194 | 741B2DB42496E94B00020939 /* Pytorch-CoreML-Sound-ClassificationTests */, 195 | ); 196 | }; 197 | /* End PBXProject section */ 198 | 199 | /* Begin PBXResourcesBuildPhase section */ 200 | 741B2D9D2496E94800020939 /* Resources */ = { 201 | isa = PBXResourcesBuildPhase; 202 | buildActionMask = 2147483647; 203 | files = ( 204 | 741B2DAF2496E94B00020939 /* LaunchScreen.storyboard in Resources */, 205 | 741B2DAC2496E94B00020939 /* Assets.xcassets in Resources */, 206 | 741B2DAA2496E94800020939 /* Main.storyboard in Resources */, 207 | 74C725A524A982E20010AC26 /* PANN_labels.json in Resources */, 208 | ); 209 | runOnlyForDeploymentPostprocessing = 0; 210 | }; 211 | 741B2DB32496E94B00020939 /* Resources */ = { 212 | isa = PBXResourcesBuildPhase; 213 | buildActionMask = 2147483647; 214 | files = ( 215 | 74C725A024A982D40010AC26 /* ring_hello.wav in Resources */, 216 | 74C725A624A982E20010AC26 /* PANN_labels.json in 
Resources */, 217 | 74C725A824A982ED0010AC26 /* PANN_out.ring_hello.json in Resources */, 218 | ); 219 | runOnlyForDeploymentPostprocessing = 0; 220 | }; 221 | /* End PBXResourcesBuildPhase section */ 222 | 223 | /* Begin PBXSourcesBuildPhase section */ 224 | 741B2D9B2496E94800020939 /* Sources */ = { 225 | isa = PBXSourcesBuildPhase; 226 | buildActionMask = 2147483647; 227 | files = ( 228 | 741B2DA72496E94800020939 /* ViewController.swift in Sources */, 229 | 741B2DCC2496EC2700020939 /* DrawSpecView.swift in Sources */, 230 | 741B2DA32496E94800020939 /* AppDelegate.swift in Sources */, 231 | 74C725A324A982E20010AC26 /* PANN.mlmodel in Sources */, 232 | 741B2DA52496E94800020939 /* SceneDelegate.swift in Sources */, 233 | 741B2DCE2496ED5100020939 /* ConvertSpectrogram.swift in Sources */, 234 | ); 235 | runOnlyForDeploymentPostprocessing = 0; 236 | }; 237 | 741B2DB12496E94B00020939 /* Sources */ = { 238 | isa = PBXSourcesBuildPhase; 239 | buildActionMask = 2147483647; 240 | files = ( 241 | 74C725A424A982E20010AC26 /* PANN.mlmodel in Sources */, 242 | 741B2DBA2496E94B00020939 /* Pytorch_CoreML_Sound_ClassificationTests.swift in Sources */, 243 | 741B2DCF2496ED5100020939 /* ConvertSpectrogram.swift in Sources */, 244 | ); 245 | runOnlyForDeploymentPostprocessing = 0; 246 | }; 247 | /* End PBXSourcesBuildPhase section */ 248 | 249 | /* Begin PBXTargetDependency section */ 250 | 741B2DB72496E94B00020939 /* PBXTargetDependency */ = { 251 | isa = PBXTargetDependency; 252 | target = 741B2D9E2496E94800020939 /* Pytorch-CoreML-Sound-Classification */; 253 | targetProxy = 741B2DB62496E94B00020939 /* PBXContainerItemProxy */; 254 | }; 255 | /* End PBXTargetDependency section */ 256 | 257 | /* Begin PBXVariantGroup section */ 258 | 741B2DA82496E94800020939 /* Main.storyboard */ = { 259 | isa = PBXVariantGroup; 260 | children = ( 261 | 741B2DA92496E94800020939 /* Base */, 262 | ); 263 | name = Main.storyboard; 264 | sourceTree = ""; 265 | }; 266 | 741B2DAD2496E94B00020939 /* LaunchScreen.storyboard */ = { 267 | isa = PBXVariantGroup; 268 | children = ( 269 | 741B2DAE2496E94B00020939 /* Base */, 270 | ); 271 | name = LaunchScreen.storyboard; 272 | sourceTree = ""; 273 | }; 274 | /* End PBXVariantGroup section */ 275 | 276 | /* Begin XCBuildConfiguration section */ 277 | 741B2DBC2496E94B00020939 /* Debug */ = { 278 | isa = XCBuildConfiguration; 279 | buildSettings = { 280 | ALWAYS_SEARCH_USER_PATHS = NO; 281 | CLANG_ANALYZER_NONNULL = YES; 282 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 283 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; 284 | CLANG_CXX_LIBRARY = "libc++"; 285 | CLANG_ENABLE_MODULES = YES; 286 | CLANG_ENABLE_OBJC_ARC = YES; 287 | CLANG_ENABLE_OBJC_WEAK = YES; 288 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 289 | CLANG_WARN_BOOL_CONVERSION = YES; 290 | CLANG_WARN_COMMA = YES; 291 | CLANG_WARN_CONSTANT_CONVERSION = YES; 292 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 293 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 294 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 295 | CLANG_WARN_EMPTY_BODY = YES; 296 | CLANG_WARN_ENUM_CONVERSION = YES; 297 | CLANG_WARN_INFINITE_RECURSION = YES; 298 | CLANG_WARN_INT_CONVERSION = YES; 299 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 300 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 301 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 302 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 303 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 304 | CLANG_WARN_STRICT_PROTOTYPES = YES; 305 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 306 | 
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 307 | CLANG_WARN_UNREACHABLE_CODE = YES; 308 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 309 | COPY_PHASE_STRIP = NO; 310 | DEBUG_INFORMATION_FORMAT = dwarf; 311 | ENABLE_STRICT_OBJC_MSGSEND = YES; 312 | ENABLE_TESTABILITY = YES; 313 | GCC_C_LANGUAGE_STANDARD = gnu11; 314 | GCC_DYNAMIC_NO_PIC = NO; 315 | GCC_NO_COMMON_BLOCKS = YES; 316 | GCC_OPTIMIZATION_LEVEL = 0; 317 | GCC_PREPROCESSOR_DEFINITIONS = ( 318 | "DEBUG=1", 319 | "$(inherited)", 320 | ); 321 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 322 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 323 | GCC_WARN_UNDECLARED_SELECTOR = YES; 324 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 325 | GCC_WARN_UNUSED_FUNCTION = YES; 326 | GCC_WARN_UNUSED_VARIABLE = YES; 327 | IPHONEOS_DEPLOYMENT_TARGET = 13.4; 328 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 329 | MTL_FAST_MATH = YES; 330 | ONLY_ACTIVE_ARCH = YES; 331 | SDKROOT = iphoneos; 332 | SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; 333 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 334 | }; 335 | name = Debug; 336 | }; 337 | 741B2DBD2496E94B00020939 /* Release */ = { 338 | isa = XCBuildConfiguration; 339 | buildSettings = { 340 | ALWAYS_SEARCH_USER_PATHS = NO; 341 | CLANG_ANALYZER_NONNULL = YES; 342 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 343 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; 344 | CLANG_CXX_LIBRARY = "libc++"; 345 | CLANG_ENABLE_MODULES = YES; 346 | CLANG_ENABLE_OBJC_ARC = YES; 347 | CLANG_ENABLE_OBJC_WEAK = YES; 348 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 349 | CLANG_WARN_BOOL_CONVERSION = YES; 350 | CLANG_WARN_COMMA = YES; 351 | CLANG_WARN_CONSTANT_CONVERSION = YES; 352 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 353 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 354 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 355 | CLANG_WARN_EMPTY_BODY = YES; 356 | CLANG_WARN_ENUM_CONVERSION = YES; 357 | CLANG_WARN_INFINITE_RECURSION = YES; 358 | CLANG_WARN_INT_CONVERSION = YES; 359 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 360 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 361 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 362 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 363 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 364 | CLANG_WARN_STRICT_PROTOTYPES = YES; 365 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 366 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 367 | CLANG_WARN_UNREACHABLE_CODE = YES; 368 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 369 | COPY_PHASE_STRIP = NO; 370 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 371 | ENABLE_NS_ASSERTIONS = NO; 372 | ENABLE_STRICT_OBJC_MSGSEND = YES; 373 | GCC_C_LANGUAGE_STANDARD = gnu11; 374 | GCC_NO_COMMON_BLOCKS = YES; 375 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 376 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 377 | GCC_WARN_UNDECLARED_SELECTOR = YES; 378 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 379 | GCC_WARN_UNUSED_FUNCTION = YES; 380 | GCC_WARN_UNUSED_VARIABLE = YES; 381 | IPHONEOS_DEPLOYMENT_TARGET = 13.4; 382 | MTL_ENABLE_DEBUG_INFO = NO; 383 | MTL_FAST_MATH = YES; 384 | SDKROOT = iphoneos; 385 | SWIFT_COMPILATION_MODE = wholemodule; 386 | SWIFT_OPTIMIZATION_LEVEL = "-O"; 387 | VALIDATE_PRODUCT = YES; 388 | }; 389 | name = Release; 390 | }; 391 | 741B2DBF2496E94B00020939 /* Debug */ = { 392 | isa = XCBuildConfiguration; 393 | buildSettings = { 394 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 395 | CODE_SIGN_STYLE = Automatic; 396 | DEVELOPMENT_TEAM = K6KE3UBBT4; 397 | INFOPLIST_FILE = "Pytorch-CoreML-Sound-Classification/Info.plist"; 398 | 
IPHONEOS_DEPLOYMENT_TARGET = 13.0; 399 | LD_RUNPATH_SEARCH_PATHS = ( 400 | "$(inherited)", 401 | "@executable_path/Frameworks", 402 | ); 403 | PRODUCT_BUNDLE_IDENTIFIER = "my.testco.Pytorch-CoreML-Sound-Classification"; 404 | PRODUCT_NAME = "$(TARGET_NAME)"; 405 | SWIFT_VERSION = 5.0; 406 | TARGETED_DEVICE_FAMILY = "1,2"; 407 | }; 408 | name = Debug; 409 | }; 410 | 741B2DC02496E94B00020939 /* Release */ = { 411 | isa = XCBuildConfiguration; 412 | buildSettings = { 413 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 414 | CODE_SIGN_STYLE = Automatic; 415 | DEVELOPMENT_TEAM = K6KE3UBBT4; 416 | INFOPLIST_FILE = "Pytorch-CoreML-Sound-Classification/Info.plist"; 417 | IPHONEOS_DEPLOYMENT_TARGET = 13.0; 418 | LD_RUNPATH_SEARCH_PATHS = ( 419 | "$(inherited)", 420 | "@executable_path/Frameworks", 421 | ); 422 | PRODUCT_BUNDLE_IDENTIFIER = "my.testco.Pytorch-CoreML-Sound-Classification"; 423 | PRODUCT_NAME = "$(TARGET_NAME)"; 424 | SWIFT_VERSION = 5.0; 425 | TARGETED_DEVICE_FAMILY = "1,2"; 426 | }; 427 | name = Release; 428 | }; 429 | 741B2DC22496E94B00020939 /* Debug */ = { 430 | isa = XCBuildConfiguration; 431 | buildSettings = { 432 | ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; 433 | BUNDLE_LOADER = "$(TEST_HOST)"; 434 | CODE_SIGN_STYLE = Automatic; 435 | DEVELOPMENT_TEAM = K6KE3UBBT4; 436 | INFOPLIST_FILE = "Pytorch-CoreML-Sound-ClassificationTests/Info.plist"; 437 | IPHONEOS_DEPLOYMENT_TARGET = 13.4; 438 | LD_RUNPATH_SEARCH_PATHS = ( 439 | "$(inherited)", 440 | "@executable_path/Frameworks", 441 | "@loader_path/Frameworks", 442 | ); 443 | PRODUCT_BUNDLE_IDENTIFIER = "my.testco.Pytorch-CoreML-Sound-ClassificationTests"; 444 | PRODUCT_NAME = "$(TARGET_NAME)"; 445 | SWIFT_VERSION = 5.0; 446 | TARGETED_DEVICE_FAMILY = "1,2"; 447 | TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Pytorch-CoreML-Sound-Classification.app/Pytorch-CoreML-Sound-Classification"; 448 | }; 449 | name = Debug; 450 | }; 451 | 741B2DC32496E94B00020939 /* Release */ = { 452 | isa = XCBuildConfiguration; 453 | buildSettings = { 454 | ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; 455 | BUNDLE_LOADER = "$(TEST_HOST)"; 456 | CODE_SIGN_STYLE = Automatic; 457 | DEVELOPMENT_TEAM = K6KE3UBBT4; 458 | INFOPLIST_FILE = "Pytorch-CoreML-Sound-ClassificationTests/Info.plist"; 459 | IPHONEOS_DEPLOYMENT_TARGET = 13.4; 460 | LD_RUNPATH_SEARCH_PATHS = ( 461 | "$(inherited)", 462 | "@executable_path/Frameworks", 463 | "@loader_path/Frameworks", 464 | ); 465 | PRODUCT_BUNDLE_IDENTIFIER = "my.testco.Pytorch-CoreML-Sound-ClassificationTests"; 466 | PRODUCT_NAME = "$(TARGET_NAME)"; 467 | SWIFT_VERSION = 5.0; 468 | TARGETED_DEVICE_FAMILY = "1,2"; 469 | TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Pytorch-CoreML-Sound-Classification.app/Pytorch-CoreML-Sound-Classification"; 470 | }; 471 | name = Release; 472 | }; 473 | /* End XCBuildConfiguration section */ 474 | 475 | /* Begin XCConfigurationList section */ 476 | 741B2D9A2496E94800020939 /* Build configuration list for PBXProject "Pytorch-CoreML-Sound-Classification" */ = { 477 | isa = XCConfigurationList; 478 | buildConfigurations = ( 479 | 741B2DBC2496E94B00020939 /* Debug */, 480 | 741B2DBD2496E94B00020939 /* Release */, 481 | ); 482 | defaultConfigurationIsVisible = 0; 483 | defaultConfigurationName = Release; 484 | }; 485 | 741B2DBE2496E94B00020939 /* Build configuration list for PBXNativeTarget "Pytorch-CoreML-Sound-Classification" */ = { 486 | isa = XCConfigurationList; 487 | buildConfigurations = ( 488 | 741B2DBF2496E94B00020939 /* Debug */, 489 | 741B2DC02496E94B00020939 /* Release */, 490 | ); 491 | 
defaultConfigurationIsVisible = 0; 492 | defaultConfigurationName = Release; 493 | }; 494 | 741B2DC12496E94B00020939 /* Build configuration list for PBXNativeTarget "Pytorch-CoreML-Sound-ClassificationTests" */ = { 495 | isa = XCConfigurationList; 496 | buildConfigurations = ( 497 | 741B2DC22496E94B00020939 /* Debug */, 498 | 741B2DC32496E94B00020939 /* Release */, 499 | ); 500 | defaultConfigurationIsVisible = 0; 501 | defaultConfigurationName = Release; 502 | }; 503 | /* End XCConfigurationList section */ 504 | }; 505 | rootObject = 741B2D972496E94800020939 /* Project object */; 506 | } 507 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/xcshareddata/xcschemes/Pytorch-CoreML-Sound-Classification.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 30 | 31 | 33 | 39 | 40 | 41 | 42 | 43 | 53 | 55 | 61 | 62 | 63 | 64 | 70 | 72 | 78 | 79 | 80 | 81 | 83 | 84 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/xcuserdata/gerald.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 9 | 17 | 18 | 19 | 20 | 21 | 23 | 31 | 32 | 33 | 34 | 35 | 37 | 45 | 46 | 47 | 48 | 49 | 51 | 59 | 60 | 61 | 62 | 63 | 65 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification.xcodeproj/xcuserdata/gerald.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | Pytorch-CoreML-Sound-Classification.xcscheme_^#shared#^_ 8 | 9 | orderHint 10 | 0 11 | 12 | 13 | SuppressBuildableAutocreation 14 | 15 | 741B2D9E2496E94800020939 16 | 17 | primary 18 | 19 | 20 | 741B2DB42496E94B00020939 21 | 22 | primary 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/AppDelegate.swift: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.swift 3 | // Pytorch-CoreML-Sound-Classification 4 | // 5 | // Created by Gerald on 6/14/20. 6 | // Copyright © 2020 Gerald. All rights reserved. 7 | // 8 | 9 | import UIKit 10 | 11 | @UIApplicationMain 12 | class AppDelegate: UIResponder, UIApplicationDelegate { 13 | 14 | 15 | 16 | func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { 17 | // Override point for customization after application launch. 
18 | return true 19 | } 20 | 21 | // MARK: UISceneSession Lifecycle 22 | 23 | func application(_ application: UIApplication, configurationForConnecting connectingSceneSession: UISceneSession, options: UIScene.ConnectionOptions) -> UISceneConfiguration { 24 | // Called when a new scene session is being created. 25 | // Use this method to select a configuration to create the new scene with. 26 | return UISceneConfiguration(name: "Default Configuration", sessionRole: connectingSceneSession.role) 27 | } 28 | 29 | func application(_ application: UIApplication, didDiscardSceneSessions sceneSessions: Set) { 30 | // Called when the user discards a scene session. 31 | // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions. 32 | // Use this method to release any resources that were specific to the discarded scenes, as they will not return. 33 | } 34 | 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "idiom" : "iphone", 5 | "scale" : "2x", 6 | "size" : "20x20" 7 | }, 8 | { 9 | "idiom" : "iphone", 10 | "scale" : "3x", 11 | "size" : "20x20" 12 | }, 13 | { 14 | "idiom" : "iphone", 15 | "scale" : "2x", 16 | "size" : "29x29" 17 | }, 18 | { 19 | "idiom" : "iphone", 20 | "scale" : "3x", 21 | "size" : "29x29" 22 | }, 23 | { 24 | "idiom" : "iphone", 25 | "scale" : "2x", 26 | "size" : "40x40" 27 | }, 28 | { 29 | "idiom" : "iphone", 30 | "scale" : "3x", 31 | "size" : "40x40" 32 | }, 33 | { 34 | "idiom" : "iphone", 35 | "scale" : "2x", 36 | "size" : "60x60" 37 | }, 38 | { 39 | "idiom" : "iphone", 40 | "scale" : "3x", 41 | "size" : "60x60" 42 | }, 43 | { 44 | "idiom" : "ipad", 45 | "scale" : "1x", 46 | "size" : "20x20" 47 | }, 48 | { 49 | "idiom" : "ipad", 50 | "scale" : "2x", 51 | "size" : "20x20" 52 | }, 53 | { 54 | "idiom" : "ipad", 55 | "scale" : "1x", 56 | "size" : "29x29" 57 | }, 58 | { 59 | "idiom" : "ipad", 60 | "scale" : "2x", 61 | "size" : "29x29" 62 | }, 63 | { 64 | "idiom" : "ipad", 65 | "scale" : "1x", 66 | "size" : "40x40" 67 | }, 68 | { 69 | "idiom" : "ipad", 70 | "scale" : "2x", 71 | "size" : "40x40" 72 | }, 73 | { 74 | "idiom" : "ipad", 75 | "scale" : "1x", 76 | "size" : "76x76" 77 | }, 78 | { 79 | "idiom" : "ipad", 80 | "scale" : "2x", 81 | "size" : "76x76" 82 | }, 83 | { 84 | "idiom" : "ipad", 85 | "scale" : "2x", 86 | "size" : "83.5x83.5" 87 | }, 88 | { 89 | "idiom" : "ios-marketing", 90 | "scale" : "1x", 91 | "size" : "1024x1024" 92 | } 93 | ], 94 | "info" : { 95 | "author" : "xcode", 96 | "version" : 1 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/Base.lproj/LaunchScreen.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 
--------------------------------------------------------------------------------
/Pytorch-CoreML-Sound-Classification/Base.lproj/Main.storyboard:
--------------------------------------------------------------------------------
[Interface Builder XML stripped during extraction; the scene hosts the DrawSpecView spectrogram view and the labels UITableView that ViewController's outlets connect to.]
--------------------------------------------------------------------------------
/Pytorch-CoreML-Sound-Classification/ConvertSpectrogram.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ConvertSpectrogram.swift
3 | // CoreML_Audio_Analysis
4 | //
5 | // Created by Gerald on 5/30/20.
6 | // Copyright © 2020 Gerald. All rights reserved.
7 | //
8 | // from https://github.com/tucan9389/DepthPrediction-CoreML/blob/master/DepthPrediction-CoreML/heatmapProcessor.swift
9 | import CoreML
10 | 
11 | class SpectrogramConverter {
12 |     func convertTo2DArray(from spectrogram: MLMultiArray) -> Array<Array<Float32>> {
13 |         guard spectrogram.shape.count == 4 else {
14 |             print("spectrogram's shape is invalid. \(spectrogram.shape)")
15 |             return []
16 |         }
17 |         let spectrogram_w = spectrogram.shape[2].intValue
18 |         let spectrogram_h = spectrogram.shape[3].intValue
19 | 
20 |         var converted_spectrogram: Array<Array<Float32>> = Array(repeating: Array(repeating: 0.0, count: spectrogram_h), count: spectrogram_w)
21 | 
22 |         var minimumValue: Float32 = Float32.greatestFiniteMagnitude
23 |         var maximumValue: Float32 = -Float32.greatestFiniteMagnitude
24 | 
25 |         for i in 0..<spectrogram_w {
26 |             for j in 0..<spectrogram_h {
27 |                 let index = i*(spectrogram_h) + j
28 |                 let val = spectrogram[index].floatValue
29 |                 guard val > 0 else { continue }
30 |                 converted_spectrogram[i][spectrogram_h-j-1] = val // origin at bottom
31 | 
32 |                 if minimumValue > val {
33 |                     minimumValue = val
34 |                 }
35 |                 if maximumValue < val {
36 |                     maximumValue = val
37 |                 }
38 |             }
39 |         }
40 | 
41 |         maximumValue = max( -15.0, maximumValue ) // for improved contrast on device
42 |         var minmaxGap = maximumValue - minimumValue
43 | 
44 |         // print( "minmax \(minmaxGap) \(maximumValue) \(minimumValue)")
45 | 
46 |         if ( minmaxGap == 0 ) {
47 |             minmaxGap = 1.0
48 |         }
49 |         for i in 0..<spectrogram_w {
50 |             for j in 0..<spectrogram_h {
51 |                 // rescale into 0..1, clamping the entries skipped above
52 |                 converted_spectrogram[i][j] = min( 1.0, max( 0.0, (converted_spectrogram[i][j] - minimumValue) / minmaxGap ) )
53 |             }
54 |         }
55 | 
56 |         return converted_spectrogram
57 |     }
58 | }
--------------------------------------------------------------------------------
/Pytorch-CoreML-Sound-Classification/DrawSpecView.swift:
--------------------------------------------------------------------------------
1 | //
2 | // DrawSpecView.swift
3 | // Pytorch-CoreML-Sound-Classification
4 | //
5 | // Created by Gerald on 6/14/20.
6 | // Copyright © 2020 Gerald. All rights reserved.
7 | //
8 | 
9 | import UIKit
10 | 
11 | class DrawSpecView: UIView {
12 | 
13 |     // spectrogram to render, already normalized to 0..1 by SpectrogramConverter;
14 |     // setting it triggers a redraw on the main thread
15 | 
16 |     var spectrogram: Array<Array<Float32>>? = nil {
17 |         didSet {
18 |             DispatchQueue.main.async {
19 |                 self.setNeedsDisplay()
20 |             }
21 |         }
22 |     }
23 | 
24 |     override func draw(_ rect: CGRect) {
25 | 
26 |         if let ctx = UIGraphicsGetCurrentContext() {
27 | 
28 |             ctx.clear(rect);
29 | 
30 |             guard let spectrogram = self.spectrogram else { return }
31 | 
32 |             let size = self.bounds.size
33 |             let spectrogram_w = spectrogram.count
34 |             let spectrogram_h = spectrogram.first?.count ?? 0
35 |             let w = size.width / CGFloat(spectrogram_w)
36 |             let h = size.height / CGFloat(spectrogram_h)
37 | 
38 |             for j in 0..<spectrogram_h {
39 |                 for i in 0..<spectrogram_w {
40 | 
41 |                     var alpha: CGFloat = CGFloat(spectrogram[i][j])
42 |                     if alpha > 
1 { 43 | alpha = 1 44 | } else if alpha < 0 { 45 | alpha = 0 46 | } 47 | 48 | let rect: CGRect = CGRect(x: CGFloat(i) * w, y: CGFloat(j) * h, width: w, height: h) 49 | 50 | // color 51 | let hue: CGFloat = (1.0-alpha) * (240.0 / 360.0) 52 | let color: UIColor = UIColor(hue: hue, saturation: 1, brightness: 1, alpha: 0.94) 53 | 54 | // gray 55 | // let color: UIColor = UIColor(white: 1-alpha, alpha: 1) 56 | 57 | let bpath: UIBezierPath = UIBezierPath(rect: rect) 58 | 59 | color.set() 60 | bpath.fill() 61 | } 62 | } 63 | } 64 | } // end of draw(rect:) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleDevelopmentRegion 6 | $(DEVELOPMENT_LANGUAGE) 7 | CFBundleExecutable 8 | $(EXECUTABLE_NAME) 9 | CFBundleIdentifier 10 | $(PRODUCT_BUNDLE_IDENTIFIER) 11 | CFBundleInfoDictionaryVersion 12 | 6.0 13 | CFBundleName 14 | $(PRODUCT_NAME) 15 | CFBundlePackageType 16 | $(PRODUCT_BUNDLE_PACKAGE_TYPE) 17 | CFBundleShortVersionString 18 | 1.0 19 | CFBundleVersion 20 | 1 21 | LSRequiresIPhoneOS 22 | 23 | NSMicrophoneUsageDescription 24 | Capture audio. 25 | UIApplicationSceneManifest 26 | 27 | UIApplicationSupportsMultipleScenes 28 | 29 | UISceneConfigurations 30 | 31 | UIWindowSceneSessionRoleApplication 32 | 33 | 34 | UISceneConfigurationName 35 | Default Configuration 36 | UISceneDelegateClassName 37 | $(PRODUCT_MODULE_NAME).SceneDelegate 38 | UISceneStoryboardFile 39 | Main 40 | 41 | 42 | 43 | 44 | UILaunchStoryboardName 45 | LaunchScreen 46 | UIMainStoryboardFile 47 | Main 48 | UIRequiredDeviceCapabilities 49 | 50 | armv7 51 | 52 | UISupportedInterfaceOrientations 53 | 54 | UIInterfaceOrientationPortrait 55 | 56 | UISupportedInterfaceOrientations~ipad 57 | 58 | UIInterfaceOrientationPortrait 59 | UIInterfaceOrientationPortraitUpsideDown 60 | UIInterfaceOrientationLandscapeLeft 61 | UIInterfaceOrientationLandscapeRight 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/PANN.mlmodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-illustrated/Pytorch-CoreML-Sound-Classification/9c5b268dbc3926494e5c1d6611da316dbefe770a/Pytorch-CoreML-Sound-Classification/PANN.mlmodel -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/PANN_labels.json: -------------------------------------------------------------------------------- 1 | ["Speech", "Male speech, man speaking", "Female speech, woman speaking", "Child speech, kid speaking", "Conversation", "Narration, monologue", "Babbling", "Speech synthesizer", "Shout", "Bellow", "Whoop", "Yell", "Battle cry", "Children shouting", "Screaming", "Whispering", "Laughter", "Baby laughter", "Giggle", "Snicker", "Belly laugh", "Chuckle, chortle", "Crying, sobbing", "Baby cry, infant cry", "Whimper", "Wail, moan", "Sigh", "Singing", "Choir", "Yodeling", "Chant", "Mantra", "Male singing", "Female singing", "Child singing", "Synthetic singing", "Rapping", "Humming", "Groan", "Grunt", "Whistling", "Breathing", "Wheeze", "Snoring", "Gasp", "Pant", "Snort", "Cough", "Throat clearing", "Sneeze", "Sniff", "Run", "Shuffle", "Walk, footsteps", "Chewing, mastication", "Biting", "Gargling", "Stomach rumble", "Burping, eructation", "Hiccup", "Fart", "Hands", "Finger 
snapping", "Clapping", "Heart sounds, heartbeat", "Heart murmur", "Cheering", "Applause", "Chatter", "Crowd", "Hubbub, speech noise, speech babble", "Children playing", "Animal", "Domestic animals, pets", "Dog", "Bark", "Yip", "Howl", "Bow-wow", "Growling", "Whimper (dog)", "Cat", "Purr", "Meow", "Hiss", "Caterwaul", "Livestock, farm animals, working animals", "Horse", "Clip-clop", "Neigh, whinny", "Cattle, bovinae", "Moo", "Cowbell", "Pig", "Oink", "Goat", "Bleat", "Sheep", "Fowl", "Chicken, rooster", "Cluck", "Crowing, cock-a-doodle-doo", "Turkey", "Gobble", "Duck", "Quack", "Goose", "Honk", "Wild animals", "Roaring cats (lions, tigers)", "Roar", "Bird", "Bird vocalization, bird call, bird song", "Chirp, tweet", "Squawk", "Pigeon, dove", "Coo", "Crow", "Caw", "Owl", "Hoot", "Bird flight, flapping wings", "Canidae, dogs, wolves", "Rodents, rats, mice", "Mouse", "Patter", "Insect", "Cricket", "Mosquito", "Fly, housefly", "Buzz", "Bee, wasp, etc.", "Frog", "Croak", "Snake", "Rattle", "Whale vocalization", "Music", "Musical instrument", "Plucked string instrument", "Guitar", "Electric guitar", "Bass guitar", "Acoustic guitar", "Steel guitar, slide guitar", "Tapping (guitar technique)", "Strum", "Banjo", "Sitar", "Mandolin", "Zither", "Ukulele", "Keyboard (musical)", "Piano", "Electric piano", "Organ", "Electronic organ", "Hammond organ", "Synthesizer", "Sampler", "Harpsichord", "Percussion", "Drum kit", "Drum machine", "Drum", "Snare drum", "Rimshot", "Drum roll", "Bass drum", "Timpani", "Tabla", "Cymbal", "Hi-hat", "Wood block", "Tambourine", "Rattle (instrument)", "Maraca", "Gong", "Tubular bells", "Mallet percussion", "Marimba, xylophone", "Glockenspiel", "Vibraphone", "Steelpan", "Orchestra", "Brass instrument", "French horn", "Trumpet", "Trombone", "Bowed string instrument", "String section", "Violin, fiddle", "Pizzicato", "Cello", "Double bass", "Wind instrument, woodwind instrument", "Flute", "Saxophone", "Clarinet", "Harp", "Bell", "Church bell", "Jingle bell", "Bicycle bell", "Tuning fork", "Chime", "Wind chime", "Change ringing (campanology)", "Harmonica", "Accordion", "Bagpipes", "Didgeridoo", "Shofar", "Theremin", "Singing bowl", "Scratching (performance technique)", "Pop music", "Hip hop music", "Beatboxing", "Rock music", "Heavy metal", "Punk rock", "Grunge", "Progressive rock", "Rock and roll", "Psychedelic rock", "Rhythm and blues", "Soul music", "Reggae", "Country", "Swing music", "Bluegrass", "Funk", "Folk music", "Middle Eastern music", "Jazz", "Disco", "Classical music", "Opera", "Electronic music", "House music", "Techno", "Dubstep", "Drum and bass", "Electronica", "Electronic dance music", "Ambient music", "Trance music", "Music of Latin America", "Salsa music", "Flamenco", "Blues", "Music for children", "New-age music", "Vocal music", "A capella", "Music of Africa", "Afrobeat", "Christian music", "Gospel music", "Music of Asia", "Carnatic music", "Music of Bollywood", "Ska", "Traditional music", "Independent music", "Song", "Background music", "Theme music", "Jingle (music)", "Soundtrack music", "Lullaby", "Video game music", "Christmas music", "Dance music", "Wedding music", "Happy music", "Funny music", "Sad music", "Tender music", "Exciting music", "Angry music", "Scary music", "Wind", "Rustling leaves", "Wind noise (microphone)", "Thunderstorm", "Thunder", "Water", "Rain", "Raindrop", "Rain on surface", "Stream", "Waterfall", "Ocean", "Waves, surf", "Steam", "Gurgling", "Fire", "Crackle", "Vehicle", "Boat, Water vehicle", "Sailboat, sailing ship", "Rowboat, canoe, 
kayak", "Motorboat, speedboat", "Ship", "Motor vehicle (road)", "Car", "Vehicle horn, car horn, honking", "Toot", "Car alarm", "Power windows, electric windows", "Skidding", "Tire squeal", "Car passing by", "Race car, auto racing", "Truck", "Air brake", "Air horn, truck horn", "Reversing beeps", "Ice cream truck, ice cream van", "Bus", "Emergency vehicle", "Police car (siren)", "Ambulance (siren)", "Fire engine, fire truck (siren)", "Motorcycle", "Traffic noise, roadway noise", "Rail transport", "Train", "Train whistle", "Train horn", "Railroad car, train wagon", "Train wheels squealing", "Subway, metro, underground", "Aircraft", "Aircraft engine", "Jet engine", "Propeller, airscrew", "Helicopter", "Fixed-wing aircraft, airplane", "Bicycle", "Skateboard", "Engine", "Light engine (high frequency)", "Dental drill, dentist's drill", "Lawn mower", "Chainsaw", "Medium engine (mid frequency)", "Heavy engine (low frequency)", "Engine knocking", "Engine starting", "Idling", "Accelerating, revving, vroom", "Door", "Doorbell", "Ding-dong", "Sliding door", "Slam", "Knock", "Tap", "Squeak", "Cupboard open or close", "Drawer open or close", "Dishes, pots, and pans", "Cutlery, silverware", "Chopping (food)", "Frying (food)", "Microwave oven", "Blender", "Water tap, faucet", "Sink (filling or washing)", "Bathtub (filling or washing)", "Hair dryer", "Toilet flush", "Toothbrush", "Electric toothbrush", "Vacuum cleaner", "Zipper (clothing)", "Keys jangling", "Coin (dropping)", "Scissors", "Electric shaver, electric razor", "Shuffling cards", "Typing", "Typewriter", "Computer keyboard", "Writing", "Alarm", "Telephone", "Telephone bell ringing", "Ringtone", "Telephone dialing, DTMF", "Dial tone", "Busy signal", "Alarm clock", "Siren", "Civil defense siren", "Buzzer", "Smoke detector, smoke alarm", "Fire alarm", "Foghorn", "Whistle", "Steam whistle", "Mechanisms", "Ratchet, pawl", "Clock", "Tick", "Tick-tock", "Gears", "Pulleys", "Sewing machine", "Mechanical fan", "Air conditioning", "Cash register", "Printer", "Camera", "Single-lens reflex camera", "Tools", "Hammer", "Jackhammer", "Sawing", "Filing (rasp)", "Sanding", "Power tool", "Drill", "Explosion", "Gunshot, gunfire", "Machine gun", "Fusillade", "Artillery fire", "Cap gun", "Fireworks", "Firecracker", "Burst, pop", "Eruption", "Boom", "Wood", "Chop", "Splinter", "Crack", "Glass", "Chink, clink", "Shatter", "Liquid", "Splash, splatter", "Slosh", "Squish", "Drip", "Pour", "Trickle, dribble", "Gush", "Fill (with liquid)", "Spray", "Pump (liquid)", "Stir", "Boiling", "Sonar", "Arrow", "Whoosh, swoosh, swish", "Thump, thud", "Thunk", "Electronic tuner", "Effects unit", "Chorus effect", "Basketball bounce", "Bang", "Slap, smack", "Whack, thwack", "Smash, crash", "Breaking", "Bouncing", "Whip", "Flap", "Scratch", "Scrape", "Rub", "Roll", "Crushing", "Crumpling, crinkling", "Tearing", "Beep, bleep", "Ping", "Ding", "Clang", "Squeal", "Creak", "Rustle", "Whir", "Clatter", "Sizzle", "Clicking", "Clickety-clack", "Rumble", "Plop", "Jingle, tinkle", "Hum", "Zing", "Boing", "Crunch", "Silence", "Sine wave", "Harmonic", "Chirp tone", "Sound effect", "Pulse", "Inside, small room", "Inside, large room or hall", "Inside, public space", "Outside, urban or manmade", "Outside, rural or natural", "Reverberation", "Echo", "Noise", "Environmental noise", "Static", "Mains hum", "Distortion", "Sidetone", "Cacophony", "White noise", "Pink noise", "Throbbing", "Vibration", "Television", "Radio", "Field recording"] 
-------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/SceneDelegate.swift: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.swift 3 | // Pytorch-CoreML-Sound-Classification 4 | // 5 | // Created by Gerald on 6/14/20. 6 | // Copyright © 2020 Gerald. All rights reserved. 7 | // 8 | 9 | import UIKit 10 | 11 | class SceneDelegate: UIResponder, UIWindowSceneDelegate { 12 | 13 | var window: UIWindow? 14 | 15 | 16 | func scene(_ scene: UIScene, willConnectTo session: UISceneSession, options connectionOptions: UIScene.ConnectionOptions) { 17 | // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`. 18 | // If using a storyboard, the `window` property will automatically be initialized and attached to the scene. 19 | // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead). 20 | guard let _ = (scene as? UIWindowScene) else { return } 21 | } 22 | 23 | func sceneDidDisconnect(_ scene: UIScene) { 24 | // Called as the scene is being released by the system. 25 | // This occurs shortly after the scene enters the background, or when its session is discarded. 26 | // Release any resources associated with this scene that can be re-created the next time the scene connects. 27 | // The scene may re-connect later, as its session was not neccessarily discarded (see `application:didDiscardSceneSessions` instead). 28 | } 29 | 30 | func sceneDidBecomeActive(_ scene: UIScene) { 31 | // Called when the scene has moved from an inactive state to an active state. 32 | // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive. 33 | } 34 | 35 | func sceneWillResignActive(_ scene: UIScene) { 36 | // Called when the scene will move from an active state to an inactive state. 37 | // This may occur due to temporary interruptions (ex. an incoming phone call). 38 | } 39 | 40 | func sceneWillEnterForeground(_ scene: UIScene) { 41 | // Called as the scene transitions from the background to the foreground. 42 | // Use this method to undo the changes made on entering the background. 43 | } 44 | 45 | func sceneDidEnterBackground(_ scene: UIScene) { 46 | // Called as the scene transitions from the foreground to the background. 47 | // Use this method to save data, release shared resources, and store enough scene-specific state information 48 | // to restore the scene back to its current state. 49 | } 50 | 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /Pytorch-CoreML-Sound-Classification/ViewController.swift: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.swift 3 | // Pytorch-CoreML-Sound-Classification 4 | // 5 | // Created by Gerald on 6/14/20. 6 | // Copyright © 2020 Gerald. All rights reserved. 7 | // 8 | 9 | import UIKit 10 | import AVKit 11 | import CoreML 12 | 13 | class ViewController: UIViewController { 14 | 15 | @IBOutlet weak var drawSpecView: DrawSpecView! 16 | 17 | @IBOutlet weak var labelsTableView: UITableView! 18 | 19 | // set up for audio 20 | private let audioEngine = AVAudioEngine() 21 | // specify the audio samples format the CoreML model 22 | private let desiredAudioFormat: AVAudioFormat = { 23 | let avAudioChannelLayout = AVAudioChannelLayout(layoutTag: kAudioChannelLayoutTag_Mono)! 
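// the PANN model was exported for mono float32 PCM at 32 kHz, so the
// microphone's 44.1 kHz stereo stream must be resampled first; see the
// AVAudioConverter set up in startAudioEngine below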
24 | return AVAudioFormat( 25 | commonFormat: .pcmFormatFloat32, 26 | sampleRate: Double( 32000 ), // as specified when creating the Pytorch model 27 | interleaved: true, 28 | channelLayout: avAudioChannelLayout 29 | ) 30 | }() 31 | 32 | // create a queue to do analysis on a separate thread 33 | private let analysisQueue = DispatchQueue(label: "com.myco.AnalysisQueue") 34 | 35 | // instantiate our model 36 | 37 | 38 | var model : PANN? = nil 39 | typealias NetworkInput = PANNInput 40 | typealias NetworkOutput = PANNOutput 41 | var class_labels: NSArray? 42 | 43 | typealias OutputClass = ( String, Float32, Int ) 44 | private var tableData: [OutputClass?] = [] 45 | 46 | 47 | // semaphore to protect the CoreML model 48 | let semaphore = DispatchSemaphore(value: 1) 49 | 50 | // for rendering our spectrogram 51 | let spec_converter = SpectrogramConverter() 52 | 53 | override func viewDidLoad() { 54 | super.viewDidLoad() 55 | // Do any additional setup after loading the view. 56 | load_model() 57 | 58 | // setup tableview datasource on bottom 59 | labelsTableView.dataSource = self 60 | 61 | } 62 | 63 | override func viewDidAppear(_ animated: Bool) { 64 | startAudioEngine() 65 | } 66 | 67 | private func load_model() { 68 | let config = MLModelConfiguration() 69 | config.computeUnits = .all 70 | do { 71 | self.model = try PANN( configuration: config ) 72 | } catch { 73 | fatalError( "unable to load ML model!" ) 74 | } 75 | 76 | guard let path = Bundle.main.path(forResource:"PANN_labels", ofType: "json") else { 77 | return 78 | } 79 | 80 | 81 | if let JSONData = try? Data(contentsOf: URL(fileURLWithPath: path)) { 82 | self.class_labels = try! JSONSerialization.jsonObject(with: JSONData, options: .mutableContainers) as? NSArray 83 | } 84 | 85 | } 86 | 87 | // audio capture via microphone 88 | private func startAudioEngine() { 89 | 90 | // https://stackoverflow.com/questions/48831411/converting-avaudiopcmbuffer-to-another-avaudiopcmbuffer 91 | // more info at https://medium.com/@prianka.kariat/changing-the-format-of-ios-avaudioengine-mic-input-c183459cab63 92 | 93 | let inputNode = audioEngine.inputNode 94 | let originalAudioFormat: AVAudioFormat = inputNode.inputFormat(forBus: 0) 95 | // input is in 44.1kHz, 2 channels 96 | 97 | let downSampleRate: Double = desiredAudioFormat.sampleRate 98 | let ratio: Float = Float(originalAudioFormat.sampleRate)/Float(downSampleRate) 99 | 100 | // print( "input sr: \(originalAudioFormat.sampleRate) ch: \(originalAudioFormat.channelCount)" ) 101 | // print( "desired sr: \(desiredAudioFormat.sampleRate) ch: \(desiredAudioFormat.channelCount) ratio \(ratio)" ) 102 | 103 | guard let formatConverter = AVAudioConverter(from:originalAudioFormat, to: desiredAudioFormat) else { 104 | fatalError( "unable to create formatConverter!" ) 105 | } 106 | 107 | // start audio capture by installing a Tap 108 | inputNode.installTap( 109 | onBus: 0, 110 | bufferSize: AVAudioFrameCount(downSampleRate * 2), 111 | format: originalAudioFormat 112 | ) { 113 | (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) 
in
114 | // closure to process the captured audio, buffer size dictated by AudioEngine/device
115 | 
116 | let capacity = UInt32(Float(buffer.frameCapacity)/ratio)
117 | 
118 | guard let pcmBuffer = AVAudioPCMBuffer(
119 |     pcmFormat: self.desiredAudioFormat,
120 |     frameCapacity: capacity) else {
121 |         print("Failed to create pcm buffer")
122 |         return
123 | }
124 | 
125 | let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
126 |     outStatus.pointee = AVAudioConverterInputStatus.haveData
127 |     return buffer
128 | }
129 | 
130 | // convert input samples into the one our model needs
131 | var error: NSError?
132 | let status: AVAudioConverterOutputStatus = formatConverter.convert(
133 |     to: pcmBuffer,
134 |     error: &error,
135 |     withInputFrom: inputBlock)
136 | 
137 | if status == .error {
138 |     if let unwrappedError: NSError = error {
139 |         print("Error \(unwrappedError)")
140 |     }
141 |     return
142 | }
143 | 
144 | // we now have the audio in mono, 32000 sample rate the CoreML model needs
145 | // convert audio samples into MLMultiArray format for CoreML models
146 | let channelData = pcmBuffer.floatChannelData
147 | let output_samples = Int(pcmBuffer.frameLength)
148 | let channelDataPointer = channelData!.pointee
149 | 
150 | //print( "converted from \(buffer.frameLength) to len \(output_samples) val[0] \(channelDataPointer[0]) \(channelDataPointer[output_samples-1])" )
151 | 
152 | let audioData = try! MLMultiArray( shape: [1, output_samples as NSNumber], dataType: .float32 )
153 | let ptr = UnsafeMutablePointer<Float32>(OpaquePointer(audioData.dataPointer))
154 | for i in 0..<output_samples {
155 |     ptr[i] = channelDataPointer[i]
156 | }
157 | 
158 | // hand the samples off to the analysis queue, off the audio thread
159 | self.analysisQueue.async {
160 |     self.runModel( audioData: audioData )
161 | }
162 | }
163 | 
164 | audioEngine.prepare()
165 | do {
166 |     try audioEngine.start()
167 | } catch {
168 |     print( "error starting audioEngine: \(error.localizedDescription)" )
169 | }
170 | }
171 | 
172 | // run one buffer of samples through the CoreML model and display the results
173 | func runModel( audioData: MLMultiArray ) {
174 | 
175 |     // the semaphore ensures only one prediction runs at a time
176 |     self.semaphore.wait()
177 | 
178 |     // package the samples as the model's input
179 |     let model_inputs: [String: Any] = [
180 |         "input.1": audioData,
181 |     ]
182 |     let provider = try! MLDictionaryFeatureProvider(dictionary: model_inputs)
183 | 
184 |     if let raw_outputs = try? self.model!.model.prediction(from: provider) {
185 |         self.semaphore.signal()
186 | 
187 |         // convert raw dictionary into our model's output object
188 |         let outputs = NetworkOutput( features: raw_outputs )
189 | 
190 |         // clipwise output: one probability for each of the 527 classes
191 |         let output_clipwise: MLMultiArray = outputs.clip_output
192 | 
193 |         // read the probabilities directly via a Float32 pointer
194 | 
195 |         let ptr_outputs = UnsafeMutablePointer<Float32>(OpaquePointer(output_clipwise.dataPointer))
196 | 
197 |         let num_classes = self.class_labels!.count
198 |         var max_class: Int = -1
199 |         var max_class_prob: Float32 = 0.0
200 |         for i in 0..<num_classes {
201 |             let val: Float32 = ptr_outputs[i]
202 |             if val > max_class_prob {
203 |                 max_class_prob = val
204 |                 max_class = i
205 |             }
206 |         }
207 |         let max_class_label: String = (self.class_labels?[max_class]) as! String
208 |         // print( "max: \(max_class) \(max_class_prob) \(max_class_label)" )
209 |         let row = OutputClass( max_class_label, max_class_prob, max_class )
210 |         /*
211 |         var max_class_2: Int = -1
212 |         var max_class_prob_2: Float32 = 0.0
213 |         for i in 0..<num_classes {
214 |             if i == max_class {
215 |                 continue
216 |             }
217 |             let val: Float32 = ptr_outputs[i]
218 |             if val > max_class_prob_2 {
219 |                 max_class_prob_2 = val
220 |                 max_class_2 = i
221 |             }
222 |         }
223 |         let max_class_label_2: String = (self.class_labels?[max_class_2]) as! String
224 |         // print( "max: \(max_class) \(max_class_prob) \(max_class_label)" )
225 |         let row_2 = OutputClass( max_class_label_2, max_class_prob_2, max_class_2 )
226 |         */
227 |         let predicted_classes = [ row ]
228 | 
229 |         DispatchQueue.main.sync {
230 |             self.showPredictedClasses(with: predicted_classes)
231 |         }
232 | 
233 | 
234 |         let output_spectrogram: MLMultiArray = outputs.melspec
235 | 
236 |         // melspectrogram is in MLMultiArray in decibels. Convert to 0..1 for visualization
237 |         // and then pass the converted spectrogram to the UI element drawSpecView
238 |         drawSpecView.spectrogram = spec_converter.convertTo2DArray(from: output_spectrogram)
239 |     } else {
240 |         self.semaphore.signal()
241 |     }
242 | }
243 | func showPredictedClasses(with predicted_classes : [OutputClass] ) {
244 |     self.tableData = predicted_classes
245 |     // print( "data: \(predicted_classes[0])" )
246 |     self.labelsTableView.reloadData()
247 | }
248 | 
249 | }
250 | 
251 | 
252 | 
253 | // MARK: - UITableView Data Source
254 | extension ViewController: UITableViewDataSource {
255 |     func tableView(_ tableView: UITableView, numberOfRowsInSection section: Int) -> Int {
256 |         return tableData.count// > 0 ? 
--------------------------------------------------------------------------------
/Pytorch-CoreML-Sound-ClassificationTests/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>CFBundleDevelopmentRegion</key>
6 | 	<string>$(DEVELOPMENT_LANGUAGE)</string>
7 | 	<key>CFBundleExecutable</key>
8 | 	<string>$(EXECUTABLE_NAME)</string>
9 | 	<key>CFBundleIdentifier</key>
10 | 	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
11 | 	<key>CFBundleInfoDictionaryVersion</key>
12 | 	<string>6.0</string>
13 | 	<key>CFBundleName</key>
14 | 	<string>$(PRODUCT_NAME)</string>
15 | 	<key>CFBundlePackageType</key>
16 | 	<string>$(PRODUCT_BUNDLE_PACKAGE_TYPE)</string>
17 | 	<key>CFBundleShortVersionString</key>
18 | 	<string>1.0</string>
19 | 	<key>CFBundleVersion</key>
20 | 	<string>1</string>
21 | </dict>
22 | </plist>
23 | 
--------------------------------------------------------------------------------
/Pytorch-CoreML-Sound-ClassificationTests/Pytorch_CoreML_Sound_ClassificationTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | //  Pytorch_CoreML_Sound_ClassificationTests.swift
3 | //  Pytorch-CoreML-Sound-ClassificationTests
4 | //
5 | //  Created by Gerald on 6/14/20.
6 | //  Copyright © 2020 Gerald. All rights reserved.
7 | //
8 | 
9 | import XCTest
10 | import AVFoundation
11 | import CoreML
12 | 
13 | @testable import Pytorch_CoreML_Sound_Classification
14 | 
15 | class Pytorch_CoreML_Sound_ClassificationTests: XCTestCase {
16 | 
17 |     func test_labels() throws {
18 | 
19 |         let bundle = Bundle(for: Pytorch_CoreML_Sound_ClassificationTests.self)
20 |         let path = bundle.path(forResource: "PANN_labels", ofType: "json")
21 | 
22 |         var labels: NSArray?
23 |         if let JSONData = try? Data(contentsOf: URL(fileURLWithPath: path!)) {
24 |             labels = try! JSONSerialization.jsonObject(with: JSONData, options: .mutableContainers) as? NSArray
25 |         }
26 |         /*
27 |         for i in 0..
98 |         let ptr = UnsafeMutablePointer<Float32>(OpaquePointer(audioData.dataPointer))
99 |         for i in 0..] }
104 |         let inputs: [String: Any] = [
105 |             inputName: audioData,
106 |         ]
107 |         // container for ML Model inputs
108 |         let provider = try! MLDictionaryFeatureProvider(dictionary: inputs)
109 | 
110 |         // Send the waveform samples into the model to generate the Spectrogram
111 |         let raw_outputs = try! model.model.prediction(from: provider)
112 | 
113 |         // convert raw dictionary into our model's output object
114 |         let outputs = NetworkOutput( features: raw_outputs )
115 | 
116 | 
117 |         let output_clipwise: MLMultiArray = outputs.clip_output
118 |         print( "clip outputs: \(output_clipwise.shape)") // [1, 527]
119 | 
120 |         // sanity check the shapes of our output
121 |         XCTAssertTrue( Int( truncating: output_clipwise.shape[1] ) == expected_clipwise.count,
122 |                        "incorrect shape[1]! \(output_clipwise.shape[1]) \(expected_clipwise.count)" )
123 | 
124 | 
125 |         // compare every element of our spectrogram with those from the JSON file
126 |         for i in 0..
183 |         let ptr = UnsafeMutablePointer<Float32>(OpaquePointer(spectrogram.dataPointer))
184 |         for i in 0..
199 |         XCTAssertTrue( converted_spec[0].min()! >= Float32(0.0), "converted spec min incorrect!" )
200 |         XCTAssertTrue( converted_spec[0].max()! <= Float32(1.0), "converted spec max incorrect!" )
201 | 
202 |     }
203 | 
204 |     func test_inference_time() throws {
205 |         // This is an example of a performance test case.
206 |         let model = PANN()
207 | 
208 |         let array_shape: [NSNumber] = [1, 12800]
209 |         let audioData = try! MLMultiArray(shape: array_shape, dataType: MLMultiArrayDataType.float32 )
210 |         let inputs: [String: Any] = [
211 |             "input.1": audioData,
212 |         ]
213 |         // container for ML Model inputs
214 |         let provider = try! MLDictionaryFeatureProvider(dictionary: inputs)
215 | 
216 |         self.measure {
217 |             // Put the code you want to measure the time of here.
218 |             let N = 10
219 |             let start_time = CACurrentMediaTime()
220 |             let options = MLPredictionOptions()
221 |             // options.usesCPUOnly = true
222 |             for _ in 0..<N {
--------------------------------------------------------------------------------
59 | 52/157: Converting Node Type Pad
60 | node.name: 230 [0, 0, 0, 0, 0, 0, 0, 0]
61 | 53/157: Converting Node Type AveragePool
62 | 54/157: Converting Node Type BatchNormalization
63 | 55/157: Converting Node Type Relu
64 | 56/157: Converting Node Type Conv
65 | 57/157: Converting Node Type BatchNormalization
66 | 58/157: Converting Node Type Relu
67 | 59/157: Converting Node Type Conv
68 | 60/157: Converting Node Type Pad
69 | node.name: 238 [0, 0, 0, 0, 0, 0, 0, 0]
70 | 61/157: Converting Node Type AveragePool
71 | 62/157: Converting Node Type BatchNormalization
72 | 63/157: Converting Node Type Relu
73 | 64/157: Converting Node Type Conv
74 | 65/157: Converting Node Type BatchNormalization
75 | 66/157: Converting Node Type Relu
76 | 67/157: Converting Node Type Conv
77 | 68/157: Converting Node Type Pad
78 | node.name: 246 [0, 0, 0, 0, 0, 0, 0, 0]
79 | 69/157: Converting Node Type AveragePool
80 | 70/157: Converting Node Type BatchNormalization
81 | 71/157: Converting Node Type Relu
82 | 72/157: Converting Node Type Conv
83 | 73/157: Converting Node Type BatchNormalization
84 | 74/157: Converting Node Type Relu
85 | 75/157: Converting Node Type Conv
86 | 76/157: Converting Node Type Pad
87 | node.name: 254 [0, 0, 0, 0, 0, 0, 0, 0]
88 | 77/157: Converting Node Type AveragePool
89 | 78/157: Converting Node Type BatchNormalization
90 | 79/157: Converting Node Type Relu
91 | 80/157: Converting Node Type Conv
92 | 81/157: Converting Node Type BatchNormalization
93 | 82/157: Converting Node Type Relu
94 | 83/157: Converting Node Type Conv
95 | 84/157: Converting Node Type Pad
96 | node.name: 262 [0, 0, 0, 0, 0, 0, 0, 0]
97 | 85/157: Converting Node Type AveragePool
98 | 86/157: Converting Node Type BatchNormalization
99 | 87/157: Converting Node Type Relu
100 | 88/157: Converting Node Type Conv
101 | 89/157: Converting Node Type BatchNormalization
102 | 90/157: Converting Node Type Relu
103 | 91/157: Converting Node Type Conv
104 | 92/157: Converting Node Type Pad
105 | node.name: 270 [0, 0, 0, 0, 0, 0, 0, 0]
106 | 93/157: Converting Node Type AveragePool
107 | 94/157: Converting Node Type BatchNormalization
108 | 95/157: Converting Node Type Relu
109 | 96/157: Converting Node Type Conv
110 | 97/157: Converting Node Type BatchNormalization
111 | 98/157: Converting Node Type Relu
112 | 99/157: Converting Node Type Conv
113 | 100/157: Converting Node Type Pad
114 | node.name: 278 [0, 0, 0, 0, 0, 0, 0, 0]
115 | 101/157: Converting Node Type AveragePool
116 | 102/157: Converting Node Type BatchNormalization
117 | 103/157: Converting Node Type Relu
118 | 104/157: Converting Node Type Conv
119 | 105/157: Converting Node Type BatchNormalization
120 | 106/157: Converting Node Type Relu
121 | 107/157: Converting Node Type Conv
122 | 108/157: Converting Node Type Pad
123 | node.name: 286 [0, 0, 0, 0, 0, 0, 0, 0]
124 | 109/157: Converting Node Type AveragePool
125 | 110/157: Converting Node Type BatchNormalization
126 | 111/157: Converting Node Type Relu
127 | 112/157: Converting Node Type Conv
128 | 113/157: Converting Node Type BatchNormalization
129 | 114/157: Converting Node Type Relu
130 | 115/157: Converting Node Type Conv
131 | 116/157: Converting Node Type Pad
132 | node.name: 294 [0, 0, 0, 0, 0, 0, 0, 0]
133 | 117/157: Converting Node Type AveragePool
134 | 118/157: Converting Node Type BatchNormalization
135 | 119/157: Converting Node Type Relu
136 | 120/157: Converting Node Type Conv
137 | 121/157: Converting Node Type BatchNormalization
138 | 122/157: Converting Node Type Relu
139 | 123/157: Converting Node Type Conv
140 | 124/157: Converting Node Type Pad
141 | node.name: 302 [0, 0, 0, 0, 0, 0, 0, 0]
142 | 125/157: Converting Node Type AveragePool
143 | 126/157: Converting Node Type BatchNormalization
144 | 127/157: Converting Node Type Relu
145 | 128/157: Converting Node Type Conv
146 | 129/157: Converting Node Type BatchNormalization
147 | 130/157: Converting Node Type Relu
148 | 131/157: Converting Node Type ReduceMean
149 | 132/157: Converting Node Type MaxPool
150 | 133/157: Converting Node Type Pad
151 | node.name: 311 [0, 0, 1, 0, 0, 1]
152 | 134/157: Converting Node Type AveragePool
153 | 135/157: Converting Node Type Add
154 | 136/157: Converting Node Type Transpose
155 | 137/157: Converting Node Type MatMul
156 | 138/157: Converting Node Type Add
157 | 139/157: Converting Node Type Relu
158 | 140/157: Converting Node Type MatMul
159 | 141/157: Converting Node Type Add
160 | 142/157: Converting Node Type Sigmoid
161 | 143/157: Converting Node Type ReduceMax
162 | 144/157: Converting Node Type Shape
163 | 145/157: Converting Node Type Gather
164 | 146/157: Converting Node Type Shape
165 | 147/157: Converting Node Type Gather
166 | 148/157: Converting Node Type Shape
167 | 149/157: Converting Node Type Gather
168 | 150/157: Converting Node Type Unsqueeze
169 | 151/157: Converting Node Type Tile
170 | 152/157: Converting Node Type Mul
171 | 153/157: Converting Node Type Unsqueeze
172 | 154/157: Converting Node Type Unsqueeze
173 | 155/157: Converting Node Type Unsqueeze
174 | 156/157: Converting Node Type Concat
175 | 157/157: Converting Node Type Reshape
176 | Translation to CoreML spec completed. Now compiling the CoreML model.
177 | Model Compilation done.
178 | out: (1, 527)
179 | out: (1, 32, 527)
180 | out: (1, 1, 41, 64)
--------------------------------------------------------------------------------
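The three shapes printed at the end of the log follow from the model args in export.py: a 12800-sample clip at hop_size 320 gives 41 mel-spectrogram frames of 64 bins, the MobileNet stack downsamples time by the interpolate_ratio of 32, and interpolate() scales the segmentwise output back up. A quick sketch of that arithmetic; the centered-STFT "+1" frame is an assumption based on torchlibrosa's default padding.

```python
# Sanity-check the output shapes printed at the end of export.log.
num_samples, hop_size, mel_bins = 12800, 320, 64
interpolate_ratio, classes_num = 32, 527

frames = num_samples // hop_size + 1      # 41 -> melspec (1, 1, 41, 64)
segments = frames // interpolate_ratio    # 1 segment after the conv stack
framewise = segments * interpolate_ratio  # 32 -> framewise (1, 32, 527)
print((1, 1, frames, mel_bins), (1, framewise, classes_num), (1, classes_num))
```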
/python/export.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 | 
7 | import librosa
8 | #import onnx
9 | import onnx_coreml
10 | #import onnxruntime
11 | import coremltools
12 | 
13 | sys.path.insert(1, os.path.join(sys.path[0], 'audioset_tagging_cnn/pytorch'))
14 | 
15 | from audioset_tagging_cnn.pytorch.pytorch_utils import interpolate, pad_framewise_output
16 | 
17 | from audioset_tagging_cnn.pytorch.models import MobileNetV1
18 | 
19 | 
20 | class MobileNetV1Export(MobileNetV1):
21 |     def __init__(self, *args, **kwargs):
22 | 
23 |         super(MobileNetV1Export, self).__init__(*args, **kwargs)
24 |         self.interpolate_ratio = 32
25 | 
26 |         self.input_name = 'input.1'
27 |         self.output_names = ['clip_output', 'frame_output', 'melspec' ]
28 | 
29 | 
30 |     def forward(self, x, mixup_lambda=None):
31 |         x = self.spectrogram_extractor(x)  # (batch_size, 1, time_steps, freq_bins)
32 |         x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)
33 |         melspec = x
34 | 
35 |         # frames_num = x.shape[2]
36 | 
37 |         x = x.transpose(1, 3)
38 |         x = self.bn0(x)
39 |         x = x.transpose(1, 3)
40 | 
41 |         x = self.features(x)
42 | 
43 |         x = torch.mean(x, dim=3)
44 | 
45 |         x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
46 |         x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
47 |         x = x1 + x2
48 | 
49 |         x = F.dropout(x, p=0.5, training=self.training)
50 |         x = x.transpose(1, 2)
51 |         x = F.relu_(self.fc1(x))
52 |         x = F.dropout(x, p=0.5, training=self.training)
53 |         segmentwise_output = torch.sigmoid(self.fc_audioset(x))
54 |         (clipwise_output, _) = torch.max(segmentwise_output, dim=1)
55 | 
56 |         # Get framewise output
57 |         framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
58 |         # DISABLED framewise_output = pad_framewise_output(framewise_output, frames_num)
59 | 
60 |         return clipwise_output, framewise_output, melspec
61 | 
62 |     def gen_torch_output( self, sample_input ):
63 |         # Forward
64 |         with torch.no_grad():
65 |             raw_outputs = self( torch.from_numpy( sample_input ) )
66 |             torch_outputs = [ item.cpu().detach().numpy() for item in raw_outputs ]
67 | 
68 |         for output in torch_outputs:
69 |             print( 'out: ', output.shape )
70 | 
71 |         return torch_outputs
72 | 
73 |     def convert_to_onnx( self, filename_onnx, sample_input ):
74 | 
75 |         input_names = [ self.input_name ]
76 |         output_names = self.output_names
77 | 
78 |         torch.onnx.export(
79 |             self,
80 |             torch.from_numpy( sample_input ),
81 |             filename_onnx,
82 |             input_names=input_names,
83 |             output_names=output_names,
84 |             verbose=False,
85 |             # operator_export_type=OperatorExportTypes.ONNX
86 |         )
87 | 
88 |     def gen_onnx_outputs( self, filename_onnx, sample_input ):
89 |         import onnxruntime
90 | 
91 |         session = onnxruntime.InferenceSession( filename_onnx, None)
92 | 
93 |         input_name = session.get_inputs()[0].name
94 | 
95 |         raw_results = session.run([], {input_name: sample_input})
96 | 
97 |         return raw_results[0]
98 | 
99 |     def convert_to_coreml( self, fn_mlmodel, sample_input ):
100 | 
101 |         # first convert to ONNX
102 |         filename_onnx = '/tmp/PANN_model.onnx'
103 |         self.convert_to_onnx( filename_onnx, sample_input )
104 | 
105 |         # onnx_outputs = self.gen_onnx_outputs( filename_onnx, sample_input )
106 | 
107 |         # set up for Core ML export
108 |         convert_params = dict(
109 |             predicted_feature_name = [],
110 |             minimum_ios_deployment_target='13',
111 |             custom_conversion_functions={'Pad':_convert_pad},
112 |         )
113 | 
114 |         mlmodel = onnx_coreml.convert(
115 |             model=filename_onnx,
116 |             **convert_params,
117 |         )
118 | 
119 |         # print(mlmodel._spec.description)
120 | 
121 |         # assert mlmodel != None, 'CoreML Conversion failed'
122 | 
123 |         mlmodel.save( fn_mlmodel )
124 | 
125 |         torch_output = self.gen_torch_output( sample_input )
126 | 
127 |         return torch_output
128 | 
129 |     """
130 |     model_inputs = {
131 |         self.input_name : sample_input
132 |     }
133 |     # do forward pass
134 |     mlmodel_outputs = mlmodel.predict(model_inputs, useCPUOnly=True)
135 | 
136 |     # fetch the spectrogram from output dictionary
137 |     mlmodel_output = mlmodel_outputs[ self.output_names[0] ]
138 |     # print( 'mlmodel_output: shape %s \nsample %s ' % ( mlmodel_output.shape, mlmodel_output[:,:,:3, :3] ) )
139 |     print( 'mlmodel_output: shape ', ( mlmodel_output.shape ) )
140 | 
141 |     # mlmodel = coremltools.models.MLModel( fn_mlmodel )
142 |     # _ = coremltools.models.MLModel( mlmodel._spec )
143 |     """
144 | 
145 | 
146 | def _convert_pad(builder, node, graph, err):
147 |     from onnx_coreml._operators import _convert_pad as _convert_pad_orig
148 | 
149 |     pads = node.attrs['pads']
150 |     print( 'node.name: ', node.name, pads )
151 | 
152 |     if node.name != '311': # hardcoded..
153 |         _convert_pad_orig( builder, node, graph, err )
154 | 
155 |     else:
156 | 
157 |         params_dict = {}
158 |         params_dict['left'] = pads[2] # padding left
159 |         params_dict['right'] = pads[5] # padding right
160 |         params_dict['top'] = 0
161 |         params_dict['bottom'] = 0
162 |         params_dict['value'] = 0.0
163 |         params_dict['padding_type'] = 'constant'
164 | 
165 |         builder.add_padding(
166 |             name=node.name,
167 |             input_name=node.inputs[0],
168 |             output_name=node.outputs[0],
169 |             **params_dict,
170 |         )
171 | 
172 | 
173 | 
174 | def save_class_label_json( fn_json ):
175 |     import csv, json
176 | 
177 |     with open('python/audioset_tagging_cnn/metadata/class_labels_indices.csv', 'r') as f:
178 |         reader = csv.reader(f, delimiter=',')
179 |         lines = list(reader)
180 | 
181 |     labels = []
182 |     for i1 in range(1, len(lines)):
183 |         id = lines[i1][1]
184 |         label = lines[i1][2]
185 |         labels.append(label)
186 | 
187 |     with open( fn_json, 'w' ) as ofp:
188 |         json.dump( labels, ofp )
189 | 
190 | def export_model( fn_mlmodel, fn_json, fn_label_json, checkpoint_path, audio_path ):
191 | 
192 |     model_args = {
193 |         'sample_rate': 32000,
194 |         'window_size': 1024,
195 |         'hop_size': 320,
196 |         'mel_bins': 64,
197 |         'fmin': 50,
198 |         'fmax': 14000,
199 |         'classes_num': 527
200 |     }
201 |     model = MobileNetV1Export(**model_args)
202 | 
203 |     checkpoint = torch.load(checkpoint_path, map_location='cpu')
204 |     model.load_state_dict(checkpoint['model'])
205 |     model.eval()
206 | 
207 |     # Load audio
208 |     sample_rate = model_args['sample_rate']
209 |     (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
210 | 
211 |     sample_input = waveform[None, :sample_rate] # (1, audio_length), at most one second of audio
212 |     print( 'waveform: ', sample_input.shape ) # e.g. (1, 12800) for the 0.4 s ring_hello.wav clip
213 | 
214 |     model_outputs = model.convert_to_coreml( fn_mlmodel, sample_input )
215 | 
216 |     save_model_output_as_json( fn_json, model_outputs )
217 | 
218 |     save_class_label_json( fn_label_json )
219 | 
220 | def save_model_output_as_json( fn_output, model_outputs ):
221 |     import json
222 |     output_data = [
223 |         model_outputs[0][0,:].tolist(), # clipwise
224 |         model_outputs[1][0,:].tolist(), # framewise
225 |         model_outputs[2][0,0,:].tolist(), # melspec
226 |     ]
227 |     with open( fn_output, 'w' ) as fp:
228 |         json.dump( output_data, fp )
229 | 
230 | 
231 | if __name__ == '__main__':
232 |     import sys
233 |     checkpoint_path = sys.argv[1]
234 |     audio_path = sys.argv[2]
235 | 
236 |     fn_mlmodel = '/tmp/PANN.mlmodel'
237 |     fn_json = '/tmp/PANN_out.ring_hello.json'
238 |     fn_label_json = '/tmp/PANN_labels.json'
239 | 
240 |     export_model( fn_mlmodel, fn_json, fn_label_json, checkpoint_path, audio_path )
241 | 
242 |     # with CoreML convert error but still usable
243 |     # python3 python/export.py 'python/MobileNetV1_mAP=0.389.pth' python/audioset_tagging_cnn/examples/R9_ZSCveAHg_7s.wav
244 |     # without convert error, used in final version
245 |     # python3 python/export.py 'python/MobileNetV1_mAP=0.389.pth' /tmp/ring_hello.wav
246 |     # xcrun coremlc compile /tmp/PANN.mlmodel /tmp/mlc_output
247 | 
248 | 
249 | '''
250 | import soundfile as sf
251 | 
252 | fn_wav = 'python/audioset_tagging_cnn/examples/R9_ZSCveAHg_7s.wav'
253 | 
254 | waveform, samplerate = sf.read( fn_wav )
255 | # samplerate is 32000
256 | num_samples = 12800
257 | sample_input = waveform[ samplerate*2:samplerate*2+num_samples ] # 0.4 s starting at sec 2
258 | 
259 | sf.write( '/tmp/ring_hello.wav', sample_input, samplerate )
260 | '''
261 | 
--------------------------------------------------------------------------------
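The custom `_convert_pad` above exists because stock onnx-coreml mishandles the 1-D Pad that node '311' emits ahead of the final AveragePool (the `[0, 0, 1, 0, 0, 1]` pads in export.log); the hook rebuilds it as an explicit constant-padding layer. Here is a small sketch of how those six ONNX pad values map onto the Core ML parameters, using the ONNX layout for a 3-D tensor (begin values for each axis, then end values):

```python
# How export.py's _convert_pad maps the ONNX pads of node '311'
# onto Core ML constant-padding parameters.
pads = [0, 0, 1, 0, 0, 1]   # ONNX 3-D layout: [b0, c0, w0, b1, c1, w1]
params = {
    'left': pads[2],        # one frame of padding before the time axis
    'right': pads[5],       # one frame after
    'top': 0, 'bottom': 0,
    'value': 0.0,
    'padding_type': 'constant',
}
print(params)               # matches the add_padding call above
```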
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.0.3
2 | soundfile==0.10.3.post1
3 | librosa==0.6.3
4 | torch==1.0.1.post2
5 | # torchlibrosa
6 | -e git://github.com/ml-illustrated/torchlibrosa.git#egg=torchlibrosa
7 | onnx==1.5.0
8 | onnx-coreml==1.3
9 | onnxruntime==1.3.0
10 | coremltools==3.4
11 | 
12 | 
--------------------------------------------------------------------------------
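With these pinned packages installed, the exported model can be checked against the PyTorch outputs on macOS, in the same spirit as the commented-out block inside convert_to_coreml; the helper name and tolerance below are assumptions, not part of the repo.

```python
# Sketch: compare the exported Core ML model with the PyTorch clipwise output.
import numpy as np
import coremltools

def verify_export(fn_mlmodel, sample_input, torch_clipwise, atol=1e-3):
    mlmodel = coremltools.models.MLModel(fn_mlmodel)
    outputs = mlmodel.predict({'input.1': sample_input}, useCPUOnly=True)
    coreml_clipwise = np.asarray(outputs['clip_output']).reshape(-1)
    if not np.allclose(coreml_clipwise,
                       np.asarray(torch_clipwise).reshape(-1), atol=atol):
        raise AssertionError('Core ML / PyTorch clipwise outputs diverge')
    print('clipwise outputs match within', atol)
```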