├── images └── lousy_doc.png ├── ParallelDP.xcodeproj ├── project.xcworkspace │ └── contents.xcworkspacedata ├── xcuserdata │ └── jz.xcuserdatad │ │ └── xcschemes │ │ ├── xcschememanagement.plist │ │ └── ParallelDP.xcscheme └── project.pbxproj ├── ParallelDP ├── parameters.plist ├── main.swift └── Shaders.metal └── readme.md /images/lousy_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reijz/parallel-computing_Swift_Metal/HEAD/images/lousy_doc.png -------------------------------------------------------------------------------- /ParallelDP.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ParallelDP.xcodeproj/xcuserdata/jz.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | ParallelDP.xcscheme 8 | 9 | orderHint 10 | 0 11 | 12 | 13 | SuppressBuildableAutocreation 14 | 15 | 2A0EA2641C362F8A003BF0CF 16 | 17 | primary 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ParallelDP/parameters.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ThreadExecutionWidth 6 | 512 7 | Capacity 8 | 24 9 | Dimension 10 | 4 11 | Mean_demand 12 | 20 13 | Deplete_threshold 14 | 20 15 | Periods 16 | 20 17 | HoldingCost 18 | 1.5 19 | SalvageValue 20 | 1 21 | OrderCost 22 | 5 23 | DisposalCost 24 | 1 25 | DiscountRate 26 | 1 27 | Price 28 | 10 29 | Distribution 30 | 31 | 2.06115362e-09 32 | 4.12230724e-08 33 | 4.12230724e-07 34 | 2.74820483e-06 35 | 1.37410241e-05 36 | 5.49640966e-05 37 | 0.000183213655 38 | 0.000523467587 39 | 0.00130866897 40 | 0.00290815326 41 | 0.00581630652 42 | 
0.0105751028 43 | 0.0176251713 44 | 0.0271156481 45 | 0.0387366401 46 | 0.0516488535 47 | 0.06456106690000001 48 | 0.07595419639999999 49 | 0.0843935515 50 | 0.0888353174 51 | 0.0888353174 52 | 0.0846050642 53 | 0.07691369469999999 54 | 0.0668814737 55 | 0.0557345614 56 | 0.0445876491 57 | 0.0342981916 58 | 0.0254060679 59 | 0.0181471913 60 | 0.0125153044 61 | 0.00834353625 62 | 0.00538292661 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /ParallelDP.xcodeproj/xcuserdata/jz.xcuserdatad/xcschemes/ParallelDP.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 39 | 40 | 41 | 42 | 43 | 44 | 54 | 56 | 62 | 63 | 64 | 65 | 66 | 67 | 73 | 75 | 81 | 82 | 83 | 84 | 86 | 87 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /ParallelDP.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 
2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 2A0EA2691C362F8A003BF0CF /* main.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2A0EA2681C362F8A003BF0CF /* main.swift */; }; 11 | 2A0EA2701C3654EE003BF0CF /* Shaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */; }; 12 | /* End PBXBuildFile section */ 13 | 14 | /* Begin PBXCopyFilesBuildPhase section */ 15 | 2A0EA2631C362F8A003BF0CF /* CopyFiles */ = { 16 | isa = PBXCopyFilesBuildPhase; 17 | buildActionMask = 2147483647; 18 | dstPath = /usr/share/man/man1/; 19 | dstSubfolderSpec = 0; 20 | files = ( 21 | ); 22 | runOnlyForDeploymentPostprocessing = 1; 23 | }; 24 | /* End PBXCopyFilesBuildPhase section */ 25 | 26 | /* Begin PBXFileReference section */ 27 | 2A0EA2651C362F8A003BF0CF /* ParallelDP */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = ParallelDP; sourceTree = BUILT_PRODUCTS_DIR; }; 28 | 2A0EA2681C362F8A003BF0CF /* main.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = main.swift; sourceTree = ""; }; 29 | 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shaders.metal; sourceTree = ""; }; 30 | 2A6953E51C5793BE00C77AD8 /* parameters.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = parameters.plist; sourceTree = ""; }; 31 | /* End PBXFileReference section */ 32 | 33 | /* Begin PBXFrameworksBuildPhase section */ 34 | 2A0EA2621C362F8A003BF0CF /* Frameworks */ = { 35 | isa = PBXFrameworksBuildPhase; 36 | buildActionMask = 2147483647; 37 | files = ( 38 | ); 39 | runOnlyForDeploymentPostprocessing = 0; 40 | }; 41 | /* End PBXFrameworksBuildPhase section */ 42 | 43 | /* Begin PBXGroup section */ 44 | 2A0EA25C1C362F8A003BF0CF = { 45 | isa = PBXGroup; 46 | 
children = ( 47 | 2A0EA2671C362F8A003BF0CF /* ParallelDP */, 48 | 2A0EA2661C362F8A003BF0CF /* Products */, 49 | ); 50 | sourceTree = ""; 51 | }; 52 | 2A0EA2661C362F8A003BF0CF /* Products */ = { 53 | isa = PBXGroup; 54 | children = ( 55 | 2A0EA2651C362F8A003BF0CF /* ParallelDP */, 56 | ); 57 | name = Products; 58 | sourceTree = ""; 59 | }; 60 | 2A0EA2671C362F8A003BF0CF /* ParallelDP */ = { 61 | isa = PBXGroup; 62 | children = ( 63 | 2A0EA2681C362F8A003BF0CF /* main.swift */, 64 | 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */, 65 | 2A6953E51C5793BE00C77AD8 /* parameters.plist */, 66 | ); 67 | path = ParallelDP; 68 | sourceTree = ""; 69 | }; 70 | /* End PBXGroup section */ 71 | 72 | /* Begin PBXNativeTarget section */ 73 | 2A0EA2641C362F8A003BF0CF /* ParallelDP */ = { 74 | isa = PBXNativeTarget; 75 | buildConfigurationList = 2A0EA26C1C362F8A003BF0CF /* Build configuration list for PBXNativeTarget "ParallelDP" */; 76 | buildPhases = ( 77 | 2A0EA2611C362F8A003BF0CF /* Sources */, 78 | 2A0EA2621C362F8A003BF0CF /* Frameworks */, 79 | 2A0EA2631C362F8A003BF0CF /* CopyFiles */, 80 | ); 81 | buildRules = ( 82 | ); 83 | dependencies = ( 84 | ); 85 | name = ParallelDP; 86 | productName = ParallelDP; 87 | productReference = 2A0EA2651C362F8A003BF0CF /* ParallelDP */; 88 | productType = "com.apple.product-type.tool"; 89 | }; 90 | /* End PBXNativeTarget section */ 91 | 92 | /* Begin PBXProject section */ 93 | 2A0EA25D1C362F8A003BF0CF /* Project object */ = { 94 | isa = PBXProject; 95 | attributes = { 96 | LastSwiftUpdateCheck = 0720; 97 | LastUpgradeCheck = 0720; 98 | ORGANIZATIONNAME = verse; 99 | TargetAttributes = { 100 | 2A0EA2641C362F8A003BF0CF = { 101 | CreatedOnToolsVersion = 7.2; 102 | }; 103 | }; 104 | }; 105 | buildConfigurationList = 2A0EA2601C362F8A003BF0CF /* Build configuration list for PBXProject "ParallelDP" */; 106 | compatibilityVersion = "Xcode 3.2"; 107 | developmentRegion = English; 108 | hasScannedForEncodings = 0; 109 | knownRegions = ( 110 | en, 111 | ); 112 
| mainGroup = 2A0EA25C1C362F8A003BF0CF; 113 | productRefGroup = 2A0EA2661C362F8A003BF0CF /* Products */; 114 | projectDirPath = ""; 115 | projectRoot = ""; 116 | targets = ( 117 | 2A0EA2641C362F8A003BF0CF /* ParallelDP */, 118 | ); 119 | }; 120 | /* End PBXProject section */ 121 | 122 | /* Begin PBXSourcesBuildPhase section */ 123 | 2A0EA2611C362F8A003BF0CF /* Sources */ = { 124 | isa = PBXSourcesBuildPhase; 125 | buildActionMask = 2147483647; 126 | files = ( 127 | 2A0EA2701C3654EE003BF0CF /* Shaders.metal in Sources */, 128 | 2A0EA2691C362F8A003BF0CF /* main.swift in Sources */, 129 | ); 130 | runOnlyForDeploymentPostprocessing = 0; 131 | }; 132 | /* End PBXSourcesBuildPhase section */ 133 | 134 | /* Begin XCBuildConfiguration section */ 135 | 2A0EA26A1C362F8A003BF0CF /* Debug */ = { 136 | isa = XCBuildConfiguration; 137 | buildSettings = { 138 | ALWAYS_SEARCH_USER_PATHS = NO; 139 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 140 | CLANG_CXX_LIBRARY = "libc++"; 141 | CLANG_ENABLE_MODULES = YES; 142 | CLANG_ENABLE_OBJC_ARC = YES; 143 | CLANG_WARN_BOOL_CONVERSION = YES; 144 | CLANG_WARN_CONSTANT_CONVERSION = YES; 145 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 146 | CLANG_WARN_EMPTY_BODY = YES; 147 | CLANG_WARN_ENUM_CONVERSION = YES; 148 | CLANG_WARN_INT_CONVERSION = YES; 149 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 150 | CLANG_WARN_UNREACHABLE_CODE = YES; 151 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 152 | CODE_SIGN_IDENTITY = "-"; 153 | COPY_PHASE_STRIP = NO; 154 | DEBUG_INFORMATION_FORMAT = dwarf; 155 | ENABLE_STRICT_OBJC_MSGSEND = YES; 156 | ENABLE_TESTABILITY = YES; 157 | GCC_C_LANGUAGE_STANDARD = gnu99; 158 | GCC_DYNAMIC_NO_PIC = NO; 159 | GCC_NO_COMMON_BLOCKS = YES; 160 | GCC_OPTIMIZATION_LEVEL = 0; 161 | GCC_PREPROCESSOR_DEFINITIONS = ( 162 | "DEBUG=1", 163 | "$(inherited)", 164 | ); 165 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 166 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 167 | GCC_WARN_UNDECLARED_SELECTOR = YES; 168 | GCC_WARN_UNINITIALIZED_AUTOS = 
YES_AGGRESSIVE; 169 | GCC_WARN_UNUSED_FUNCTION = YES; 170 | GCC_WARN_UNUSED_VARIABLE = YES; 171 | MACOSX_DEPLOYMENT_TARGET = 10.11; 172 | MTL_ENABLE_DEBUG_INFO = YES; 173 | ONLY_ACTIVE_ARCH = YES; 174 | SDKROOT = macosx; 175 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 176 | }; 177 | name = Debug; 178 | }; 179 | 2A0EA26B1C362F8A003BF0CF /* Release */ = { 180 | isa = XCBuildConfiguration; 181 | buildSettings = { 182 | ALWAYS_SEARCH_USER_PATHS = NO; 183 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 184 | CLANG_CXX_LIBRARY = "libc++"; 185 | CLANG_ENABLE_MODULES = YES; 186 | CLANG_ENABLE_OBJC_ARC = YES; 187 | CLANG_WARN_BOOL_CONVERSION = YES; 188 | CLANG_WARN_CONSTANT_CONVERSION = YES; 189 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 190 | CLANG_WARN_EMPTY_BODY = YES; 191 | CLANG_WARN_ENUM_CONVERSION = YES; 192 | CLANG_WARN_INT_CONVERSION = YES; 193 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 194 | CLANG_WARN_UNREACHABLE_CODE = YES; 195 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 196 | CODE_SIGN_IDENTITY = "-"; 197 | COPY_PHASE_STRIP = NO; 198 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 199 | ENABLE_NS_ASSERTIONS = NO; 200 | ENABLE_STRICT_OBJC_MSGSEND = YES; 201 | GCC_C_LANGUAGE_STANDARD = gnu99; 202 | GCC_NO_COMMON_BLOCKS = YES; 203 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 204 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 205 | GCC_WARN_UNDECLARED_SELECTOR = YES; 206 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 207 | GCC_WARN_UNUSED_FUNCTION = YES; 208 | GCC_WARN_UNUSED_VARIABLE = YES; 209 | MACOSX_DEPLOYMENT_TARGET = 10.11; 210 | MTL_ENABLE_DEBUG_INFO = NO; 211 | SDKROOT = macosx; 212 | }; 213 | name = Release; 214 | }; 215 | 2A0EA26D1C362F8A003BF0CF /* Debug */ = { 216 | isa = XCBuildConfiguration; 217 | buildSettings = { 218 | PRODUCT_NAME = "$(TARGET_NAME)"; 219 | }; 220 | name = Debug; 221 | }; 222 | 2A0EA26E1C362F8A003BF0CF /* Release */ = { 223 | isa = XCBuildConfiguration; 224 | buildSettings = { 225 | PRODUCT_NAME = "$(TARGET_NAME)"; 226 | }; 227 | name = 
Release; 228 | }; 229 | /* End XCBuildConfiguration section */ 230 | 231 | /* Begin XCConfigurationList section */ 232 | 2A0EA2601C362F8A003BF0CF /* Build configuration list for PBXProject "ParallelDP" */ = { 233 | isa = XCConfigurationList; 234 | buildConfigurations = ( 235 | 2A0EA26A1C362F8A003BF0CF /* Debug */, 236 | 2A0EA26B1C362F8A003BF0CF /* Release */, 237 | ); 238 | defaultConfigurationIsVisible = 0; 239 | defaultConfigurationName = Release; 240 | }; 241 | 2A0EA26C1C362F8A003BF0CF /* Build configuration list for PBXNativeTarget "ParallelDP" */ = { 242 | isa = XCConfigurationList; 243 | buildConfigurations = ( 244 | 2A0EA26D1C362F8A003BF0CF /* Debug */, 245 | 2A0EA26E1C362F8A003BF0CF /* Release */, 246 | ); 247 | defaultConfigurationIsVisible = 0; 248 | defaultConfigurationName = Release; 249 | }; 250 | /* End XCConfigurationList section */ 251 | }; 252 | rootObject = 2A0EA25D1C362F8A003BF0CF /* Project object */; 253 | } 254 | -------------------------------------------------------------------------------- /ParallelDP/main.swift: -------------------------------------------------------------------------------- 1 | // 2 | // main.swift 3 | // ParallelDP 4 | // 5 | // Created by Jiheng Zhang on 1/1/2016. 6 | // Copyright © 2016 verse. All rights reserved. 7 | // 8 | 9 | import Foundation 10 | import MetalKit 11 | 12 | let fileManager = NSFileManager.defaultManager() 13 | 14 | // Specify the plist file while running 15 | let path = fileManager.currentDirectoryPath 16 | let args = Process.arguments 17 | if (args.count != 2) { 18 | print("Please specify the plist file for paremeters!") 19 | exit(1) 20 | } 21 | let plistPath = path + "/" + args[1] 22 | //print(plistPath) 23 | 24 | if !fileManager.fileExistsAtPath(plistPath) { 25 | print("Cannot find plist file!") 26 | exit(1) 27 | } 28 | 29 | // Reading paremeters from plist 30 | let dict = NSDictionary(contentsOfFile: plistPath) 31 | // print(dict) 32 | let numPeriods: Int! = dict!.valueForKey("Periods") as? 
Int 33 | let mean_demand: Int! = dict!.valueForKey("Mean_demand") as? Int 34 | let deplete_threshold: Int! = dict!.valueForKey("Deplete_threshold") as? Int 35 | let L: Int! = dict!.valueForKey("Dimension") as? Int 36 | let K: Int! = dict!.valueForKey("Capacity") as? Int 37 | let holdingCost: Float! = dict!.valueForKey("HoldingCost") as? Float 38 | let salvageValue: Float! = dict!.valueForKey("SalvageValue") as? Float 39 | let orderCost: Float! = dict!.valueForKey("OrderCost") as? Float 40 | let disposalCost: Float! = dict!.valueForKey("DisposalCost") as? Float 41 | let discountRate: Float! = dict!.valueForKey("DiscountRate") as? Float 42 | let price: Float! = dict!.valueForKey("Price") as? Float 43 | let dist: [Float]! = dict!.valueForKey("Distribution") as? [Float] 44 | // Allow user to specify threadExecutionWidth depending on their own hardware 45 | let threadExecutionWidth: Int! = dict!.valueForKey("ThreadExecutionWidth") as? Int 46 | 47 | print("The complexity is with Capacity", K, "and Dimension", L) 48 | 49 | let max_demand: Float = Float(dist.count) 50 | // The order matters 51 | let paramemterVector: [Float] = [ 52 | Float(K), 53 | Float(L), 54 | salvageValue, 55 | holdingCost, 56 | orderCost, 57 | disposalCost, 58 | discountRate, 59 | price, 60 | max_demand, 61 | Float(mean_demand), 62 | Float(numPeriods), 63 | Float(deplete_threshold) 64 | ] 65 | 66 | // basic calcuation of buffer 67 | let numberOfStates = Int(pow(Double(K), Double(L))) 68 | let unitSize = sizeof(Float) 69 | let resultBufferSize = numberOfStates*unitSize 70 | 71 | // basic calculation of device related parameter 72 | let numThreadsPerGroup = MTLSize(width:threadExecutionWidth,height:1,depth:1) 73 | 74 | // Initialize Metal 75 | // Get the default device, which is the same as the one monitor is using 76 | var device: MTLDevice! 
= MTLCreateSystemDefaultDevice() 77 | // In the following, choose the device NOT used by monitor 78 | let devices: [MTLDevice] = MTLCopyAllDevices() 79 | for metalDevice: MTLDevice in devices { 80 | if metalDevice.headless == true { 81 | device = metalDevice 82 | } 83 | } 84 | // exit with an error message if all devices are used by monitor 85 | if !device.headless { 86 | print("no dedicated device found") 87 | exit(1) 88 | } 89 | 90 | // Build command queue 91 | var commandQueue: MTLCommandQueue! = device.newCommandQueue() 92 | 93 | // Allocate memory on device 94 | let resourceOption = MTLResourceOptions() 95 | var buffer:[MTLBuffer] = [ 96 | device.newBufferWithLength(resultBufferSize, options: resourceOption), 97 | device.newBufferWithLength(resultBufferSize, options: resourceOption), 98 | device.newBufferWithLength(resultBufferSize, options: resourceOption), // depletion action 99 | device.newBufferWithLength(resultBufferSize, options: resourceOption) // order action 100 | ] 101 | var parameterBuffer:MTLBuffer = device.newBufferWithBytes(paramemterVector, length: unitSize*paramemterVector.count, options: resourceOption) 102 | // put distriburion buffer here 103 | var distributionBuffer:MTLBuffer = device.newBufferWithBytes(dist, length: unitSize*dist.count, options: resourceOption) 104 | 105 | // Get functions from Shaders and add to MTL library 106 | var DPLibrary: MTLLibrary! = device.newDefaultLibrary() 107 | let initDP = DPLibrary.newFunctionWithName("initialize") 108 | let pipelineFilterInit = try device.newComputePipelineStateWithFunction(initDP!) 109 | let iterateDP = DPLibrary.newFunctionWithName("iterate") 110 | let pipelineFilterIterate = try device.newComputePipelineStateWithFunction(iterateDP!) 111 | let iterateFluid = DPLibrary.newFunctionWithName("iterate_fluid") 112 | let pipelineFilterIterate_fluid = try device.newComputePipelineStateWithFunction(iterateFluid!) 
113 | let iterateNVP = DPLibrary.newFunctionWithName("iterate_NVP") 114 | let pipelineFilterIterate_NVP = try device.newComputePipelineStateWithFunction(iterateNVP!) 115 | let iterateNVP_1 = DPLibrary.newFunctionWithName("iterate_NVP_1") 116 | let pipelineFilterIterate_NVP_1 = try device.newComputePipelineStateWithFunction(iterateNVP_1!) 117 | 118 | var start = NSDate() 119 | // Initialize 120 | for l: Int in 1...L { 121 | 122 | let batchSize:uint = uint(pow(Float(K),Float(l-1))) 123 | let numGroupsBatch = MTLSize(width:(Int(batchSize)+threadExecutionWidth-1)/threadExecutionWidth, height:1, depth:1) 124 | // print("Batch Size = ", batchSize) 125 | 126 | for batchIndex: uint in 1.. It was written on 2016-02-08, and posted on my jekyll site. Since I have retired my jekyll site, better to keep a copy here. 4 | 5 | 6 | In 4 weeks of part-time effort, with zero knowledge in Swift/Metal but the help of the Internet and curiosity, I evolved a [sample code](http://memkite.com/blog/2014/12/15/data-parallel-programming-with-metal-and-swift-for-iphoneipad-gpu) found on the Internet into a full feature and easy-to-use and -modify code for one of my research problems requiring large scale parallel computing. Please feel free to check out the Swift/Metal code from [my github repository](https://github.com/reijz/ParallelComputing-Swift-Metal) and let me know any feedback you may have. 7 | 8 | ## How did it get started 9 | 10 | Recently my colleagues and I have been working on *approximate dynamic programming* with applications in perishable inventory management. At some point, we needed to perform numerical experiments to demonstrate the nice theories and approximation algorithms we created to solve the problem (at least approximately). 11 | 12 | The computing task in a nut shell, putting aside the mathematics, is to initialize a large array of size $K^L$ (e.g., $K=9$, $L=8$) and then iterate the array according to some rule for a few (say $T$) times. 
**The initialization and each iteration can be done in parallel**. 13 | 14 | Nvidia has made a lot of buzz in the field of parallel computing (or Artificial Intelligence, Deep Learning, Neural-Network, or whatever buzz word they call it) by utilizing large number (about 2000) of computing cores in their video cards. Though I haven't coded using C/C++ for several years, I know the existence of parallel computing platforms such as CUDA and OpenCL, with the former making much louder sound. 15 | 16 | In the spring of 2015, I hired two smart kids major in CS and ECE on our campus to write a computer program for our mathematical model using CUDA. After several months of communication involving my PhD student (who knows deep in the math but not much on coding), the two programmers and myself, we got a working code in CUDA which runs blazing fast if you imagine how long such computing task would take with a traditional single thread CPU computing. 17 | 18 | 19 | ## An adventure 20 | 21 | However, the CUDA code was not written in a clean way making further changes quite difficult. As the research on theories develops, we need modification on the algorithms from time to time, but programmers will never be so freely available. More importantly, I got curious about the parallel computing, and wanted to know what's going on there. I managed to hire one of the programmers to continue work for me in the summer of 2015, as the other got an intern job at Tensent, a tech giant in China. We aimed to create a clean version of code using OpenCL as I have a top configured MacPro (2013, a.k.a. "trash can"), which does not support CUDA. We failed that task. One reason is that OpenCL is indeed a lot harder to use than CUDA, although it can harness both CPU and GPU power. This is how [Metal](https://developer.apple.com/metal/), an OpenGL/CL replacement on Mac OS and much more beyond in my view, comes into play. The project turns out to be a nice adventure. 
22 | 23 | > Things are NOT as difficult as they appear to be! 24 | 25 | We are probably living in the best age in the human history, despite pollution, inequality, poverty, etc., which anyway exist all time. The Internet and curiosity can teach you pretty much everything and enable you to get things done. The following are the main steps I took in this effort. 26 | 27 | 1. Naturally, I started by reading Apple's [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html), which was not a terribly long document, but fun and easy to read. Essentially, one just needs to focus on the chapter "Data-Parallel Compute Processing: Compute Command Encoder", which clearly instructed a standard procedure to do parallel computing. The document did not provide any sample code, and I happen to start with absolutely zero knowledge in Swift. So I found a swift [programming course](https://www.coursera.org/learn/swift-programming) on Coursera and went through the first three weeks' content. I also skimmed through the [The Swift Programming Language](https://swift.org/documentation/TheSwiftProgrammingLanguage(Swift2.2).epub) book published by Apple. 28 | 29 | 2. At the turn of 2015 to 2016, only a handful of related results can be found by Googling keywords like "parallel computing swift metal". The article written by [Amund Tveit](https://www.linkedin.com/in/amundtveit) a year ago, [Data-Parallel Programming with Metal and Swift for iPhone/iPad GPU](http://memkite.com/blog/2014/12/15/data-parallel-programming-with-metal-and-swift-for-iphoneipad-gpu), is particularly helpful. 
With the help of the official [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html) and some googling, I managed to compile and run Amund's sample code with some [minor tweaks](### Swift is a developing language). 30 | 31 | The code basically utilizes Swift/Metal to compute the Sigmund function for a large number of input values in parallel. However, this is far from what I need, since my algorithm requires full control of assigning computing cores to individual elements in the array and more. 32 | 33 | 3. I found a nice [CUDA course](https://www.coursera.org/course/hetero) on Coursera and studied the first five weeks' content since Apple's [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html) stated the prerequisites "You should be familiar with the Objective-C language and experienced in programming with OpenGL, OpenCL, or similar APIs." 34 | Despite all the grammar and formality, the key thing I learned from this course is "linearity", or how to assign computing tasks to cores. This is exactly what I needed to know. 35 | With this knowledge, I studied carefully the CUDA code written by my programmers. 36 | 37 | 4. Metal seems to take quite a different approach than CUDA. For example, CUDA has three function type qualifiers, `__divice__`, `__global__` and `__host__`, which determines where the function can be called and executed. This enables quite some freedom in terms of structuring the code and designing the algorithm. I am sure the real "Kongfu" masters can write beautiful code with such freedom. But for less talented general public, this seems overly complicated and may make code hard to read/debug/modify. 
Briefly, `__Global__` functions can be called from the host side using CUDA kernel call semantics `<<>>`, while `__device__` functions can only be called from other device or global functions. The `grid_dim` and `block_dim` help to specific which cores compute which data. 38 | 39 | The Metal framework seems to build a "wall" between the device and host. Those functions on device are written in Objective C (with some limitations such as dynamic memory allocation is forbidden) while functions on host are written in Swift [^objectivec]. You don't **call** a device function, you only **schedule** the device function. 40 | For the device functions, there are some built-in input arguments, such as 41 | 42 | | Attribute Qualifier | 43 | | ------------- | 44 | | `[[ thread_position_in_grid ]]` | 45 | | `[[ thread_position_in_threadgroup ]]` | 46 | | `[[ thread_index_in_threadgroup ]]` | 47 | | `[[ threadgroup_position_in_grid ]]` | 48 | | `[[ threads_per_grid ]]` | 49 | | `[[ threads_per_threadgroup ]]` | 50 | | `[[ threadgroups_per_grid ]]` | 51 | | `[[ thread_execution_width ]]` | 52 | 53 | In other words, you never need to explicitly transmit the information about grid dimension and block dimension. I wrote the following code in order to test my understanding. 54 | 55 | Another disadvantage of such a "wall" is to make transmission of parameters inconvenient. And you certainly do not want to specify a set of parameters twice in both the Objective C for device and Swift for host. A workaround is to use shared buffer to transmit parameters. See my [code](https://github.com/reijz/ParallelComputing-Swift-Metal) for example. 56 | 57 | [^objectivec]: For those who prefer to use Objective C over Swift, they can still write function on host using Objective C. But the wall still exists. 
58 | 59 | 60 | ```swift 61 | // to test the understanding of thread related concepts 62 | kernel void testThread(device float *result [[ buffer(0) ]], 63 | uint id [[ thread_position_in_grid ]], 64 | uint i [[ thread_position_in_threadgroup ]], 65 | uint w [[ threadgroup_position_in_grid ]], 66 | uint S [[ threads_per_threadgroup ]]) { 67 | 68 | if (id == w*S+i) 69 | result[id] = id; 70 | else 71 | result[id] = 0; 72 | } 73 | ``` 74 | 75 | 76 | 77 | ## Beyond this project 78 | 79 | Well, I got a nice code for my research project. And my PhD student is using it to do extensive numerical experiments on my Mac Pro. On the way, I picked up a new programming language and learned a lot more. I also involved my PhD student in this adventure. Now he seems feel more comfortable with coding. 80 | 81 | More importantly, this adventure triggers some thoughts on various related issues. I would like to share one, that is **knowing more enables you to see things clearly**. 82 | 83 | Coming back to Nvidia, "the" major player in parallel computing, they seem to have a line of both hardware and software products. Tesla K40 GPU seems to be a popular choice for high performance computing. As of January 2016, I got the following estimated price quote 84 | 85 | 86 | | |GPU Compute Node|Top configured MacPro| 87 | | ------------- | ------------- | ------------- | 88 | | Price | 11,000 USD | 9,000 USD | 89 | | CPU | 2 x Intel Xeon E5-2670 | 1 x Intel Xeon E5-2697 | 90 | | Memory | 64G | 64G | 91 | | GPU | 2 x Nvidia Tesla K40 | 2 x AMD FirePro D700 | 92 | | Hard disk | 1.2 TB 10K rpm | 1 TB SSD | 93 | | Something I don't know | 2 x GE network interfaces | | 94 | | Something I don't know | 1 x 56Gb Infiniband adapter | | 95 | 96 | **The notoriously expensive Apple product is cheaper by 20%.** Ok, the MacPro only has one Intel Xeon E5 (12-core) processor instead of two, but it has 1TB flash drive instead of traditional spinning hard disk. 
Anyway, it's fair to say these two configurations are comparable. 97 | 98 | I know you may want to talk about the **performance**. Since I have both CUDA and Swift/Metal codes, I did the comparison based on computing task for my research problem. For exactly the same set of [parameters]({{ site.url }}{{ site.asset }}/CUDA/parameters.plist), the following table shows the computing time (in seconds) on our university's Tesla K40 server and my MacPro, both using only a single GPU and float precision (instead of double precision). 99 | 100 | | `threadExecutionWidth` | Nvidia Tesla M2090| Nvidia Tesla K20 | Nvidia Tesla K40 | AMD FirePro D700 on MacPro | NVIDIA GeForce GT 650M on MacBook Pro (2012)| 101 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 102 | | 1 | 446 | 2451 | | 728 | NA | 103 | | 2 | 259 | 1406 | | 403 | NA | 104 | | 4 | 166 | 867 | | 217 | NA | 105 | | 8 | 124 | 307 | | 123 | NA | 106 | | 16 | 105 | 332 | | 87 | 245 | 107 | | 32 | **99** | 212 | | 99 | 131 | 108 | | 64 | 109 | 215 | | 106 | 88 | 109 | | 128 | 108 | **214** | | 106 | **79** | 110 | | 256 | 110 | **214** | | 107 | 81 | 111 | | 512 | 111 | **214** | | 108 | 84 | 112 | | 1024 | NA[^thread] | 215 | | 104 | 99 | 113 | | 2048 | NA | NA | | **87.36** | NA | 114 | 115 | The `threadExecutionWidth` is set in CUDA 116 | 117 | ```c++ 118 | *block_dim = dim3(width:threadExecutionWidth, 1, 1); 119 | ``` 120 | 121 | and is set in Swift/Metal 122 | 123 | ```swift 124 | let numThreadsPerGroup = MTLSize(width:threadExecutionWidth,height:1,depth:1) 125 | ``` 126 | 127 | As of writing this article, I still do not understand how would the `threadExecutionWidth` affect the speed. In theory, it should not. But the little experiment I did in the above table show the other way. And it is not even monotone, larger `threadExecutionWidth` faster speed, as generally perceived. 
128 | 129 | [^thread]: My programmer told me that `threadExecutionWidth` cannot be higher than 512 on Tesla M2090. [http://stackoverflow.com/questions/5062781/cuda-max-threads-in-a-block](http://stackoverflow.com/questions/5062781/cuda-max-threads-in-a-block) And Indeed, setting an NA `threadExecutionWidth`, the CUDA and swift code will compile and run, but yield completely wrong results. 130 | 131 | 132 | Yes, my **MacPro is about 10% faster than Tesla M2090 GPU server, and much fast than K20.** In addition to performing the parallel computing task, my MacPro serves as github and VPN server for my whole research team. My team also run traditional single thread Python and Java code on this machine quite frequently. Moreover, I use it as a day-to-day computer, typing this article for example. Don't forget that I could do professional video/photo/sound editing and play stunning 3D game on this machine had I have that skill set. As a final remark, my MacPro, with only 44W when Idle and 270W at CPU Max[^power], is a lot more energy efficient than most servers on the market. 133 | 134 | However, it is surprising that the 2,500 USD MacBook Pro 15" purchased in 2012 is the fastest in this test! There are simply many things we, as general users of those parellel computing platform, don't know. 135 | 136 | [^power]: The MacPro power consumption can be found at [https://support.apple.com/en-us/HT201796](https://support.apple.com/en-us/HT201796) 137 | 138 | I am not surprised that Nvidia loses to Apple on scientific parallel computing using GPU, which is not a focus of Apple. Here is a screenshot I took while browsing the official [CUDA Document](http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3zfjtApED). 139 | 140 | ![drawing](./images/lousy_doc.png) 141 | 142 | Did you notice the red line under those words like "Quadro" and "Mathematica"? The table is a picture, presumably prepared using something like Word which highlights any misspelling. 
What an irresponsible team would do such a lousy job! I have problem believing how well things can be designed and optimized underneath the hood of the CUDA system. It seems Nvidia focus too much on marketing, since I've heard so much about it while its document is so lousy. You may take a look at the Apple document, e.g., [Metal Shading Language Guide](https://developer.apple.com/library/ios/documentation/Metal/Reference/MetalShadingLanguageGuide/Introduction/Introduction.html), just for comparison. I don't mean that giving the document a face lift would make the product great. What I mean is that we need to focus on making truely great stuff in every aspect we could possibly think of. -------------------------------------------------------------------------------- /ParallelDP/Shaders.metal: -------------------------------------------------------------------------------- 1 | // 2 | // Shaders.metal 3 | // ParallelDP 4 | // 5 | // Created by Jiheng Zhang on 1/1/2016. 6 | // Copyright © 2016 verse. All rights reserved. 
//

#include <metal_stdlib>
using namespace metal;

// Upper bound on the number of age classes (L) supported by the state decoder.
// Each inventory state is encoded in base K across L age classes.
#define max_dimension 13

// Expected one-period value of taking action (depleteUnits, orderUnits) in the
// state encoded by idCurrent, averaged over the demand distribution.
//
// For each demand level d (weighted by distribution[d]) the state transition is:
//   decode -> deplete oldest-first -> pay holding cost -> receive order ->
//   sell oldest-first -> dispose expired stock -> re-encode future state.
// The future state's value is read from inVector and discounted.
//
// Shared by iterate / iterate_fluid / iterate_NVP / iterate_NVP_1, which
// previously each carried a verbatim copy of this loop.
static float expectedStateValue(uint idCurrent,
                                int depleteUnits,
                                int orderUnits,
                                int K, int L, int max_demand,
                                const device float *distribution,
                                const device float *inVector,
                                float salvageValue, float holdingCost,
                                float orderCost, float disposalCost,
                                float discountRate, float price) {
    int idState[max_dimension + 1];
    float value = 0.;

    for (int d = 0; d < max_demand; d++) {
        // Decode idCurrent (base K) into per-age-class inventory levels.
        // idState[0] is the oldest stock; idState[L] is reserved for new orders.
        int idSum = 0, index = idCurrent;
        for (int l = L - 1; l >= 0; l--) {
            idState[l] = index % K;
            idSum += idState[l];
            index /= K;
        }
        idState[L] = 0;

        // Deplete the requested units, consuming the oldest stock first.
        int remainDeplete = depleteUnits;
        for (int l = 0; l < L; l++) {
            if (remainDeplete <= idState[l]) {
                idState[l] -= remainDeplete;
                break;
            }
            remainDeplete -= idState[l];
            idState[l] = 0;
        }

        // Holding cost is charged on the stock remaining after depletion.
        int hold = idSum - depleteUnits;

        // Newly ordered units enter the youngest slot.
        idState[L] = orderUnits;

        // Sell up to d units, again consuming the oldest stock first.
        int sell = 0, remainSell = d;
        for (int l = 0; l < L + 1; l++) {
            if (remainSell <= idState[l]) {
                sell += remainSell;
                idState[l] -= remainSell;
                break;
            }
            remainSell -= idState[l];
            sell += idState[l];
            idState[l] = 0;
        }

        // Whatever remains in the oldest slot expires and is disposed of.
        int dispose = idState[0];
        idState[0] = 0;

        // Re-encode the surviving stock (ages shift by one) as the next state.
        int future = 0;
        for (int l = 1; l < L + 1; l++) {
            future = future * K + idState[l];
        }

        // Immediate reward plus discounted continuation value for this demand.
        float sample = salvageValue * depleteUnits
                     - holdingCost * hold
                     + discountRate * (-orderCost * orderUnits
                                       + price * sell
                                       - disposalCost * dispose
                                       + inVector[future]);
        value += sample * distribution[d];
    }

    return value;
}

// Seed the terminal value function batch by batch: each state's value is its
// parent state's value (one batch earlier) plus the salvage value of the
// additional unit. batch[0] is the batch stride, batch[1] the batch index.
kernel void initialize(const device uint *batch[[buffer(1)]],
                       const device float *parameters[[buffer(2)]],
                       device float *initValue [[buffer(0)]],
                       uint id [[ thread_position_in_grid ]]) {

    float salvageValue = parameters[2];

    // Index of the state handled by this thread and of its parent state.
    uint idCurrent = batch[0]*batch[1] + id;
    uint idParent = idCurrent - batch[0];

    initValue[idCurrent] = initValue[idParent] + salvageValue;
}

// One backward-induction step of the OPTIMAL policy: for each state, search a
// (deplete, order) action window derived from the parent state's optimal
// action (monotone-policy pruning) and keep the best expected value.
kernel void iterate(const device uint *batch[[buffer(4)]],
                    const device float *parameters[[buffer(5)]],
                    const device float *distribution[[buffer(6)]],
                    const device float *inVector [[buffer(0)]],
                    device float *outVector [[ buffer(1) ]],
                    device float *deplete[[buffer(2)]],
                    device float *order[[buffer(3)]],
                    uint id [[ thread_position_in_grid ]]) {

    // Unpack model parameters (layout fixed by the host-side Swift code).
    int K = int(parameters[0]), L = int(parameters[1]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    uint idCurrent = batch[0]*batch[1] + id;
    uint idParent = idCurrent - batch[0];

    // Full search window; used only for the very first state (idCurrent == 0).
    int min_deplete = 0, max_deplete = 1;
    int min_order = 0, max_order = K;

    if (idCurrent != 0) {
        // Narrow the window around the parent state's recorded optimal action.
        min_deplete = int(deplete[idParent]) + int(deplete[idParent] != 0.);
        max_deplete = int(deplete[idParent]) + 2;
        int min_order_1 = int(order[idParent]) + int(deplete[idParent] != 0.) - 1;
        min_order = min_order_1 * int(min_order_1 >= 0);  // clamp at zero
        max_order = int(order[idParent]) + 1;
    }

    int opt_deplete = 0;
    int opt_order = 0;
    float opt_value = 0.;

    for (int i = min_deplete; i < max_deplete; i++) {
        for (int j = min_order; j < max_order; j++) {
            float state_value = expectedStateValue(idCurrent, i, j,
                                                   K, L, max_demand,
                                                   distribution, inVector,
                                                   salvageValue, holdingCost,
                                                   orderCost, disposalCost,
                                                   discountRate, price);
            // 1e-6 tolerance keeps the lexicographically smallest action on ties.
            if (state_value > opt_value + 1e-6) {
                opt_value = state_value;
                opt_deplete = i;
                opt_order = j;
            }
        }
    }

    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(opt_deplete);
    order[idCurrent] = float(opt_order);
}

// One backward-induction step of the FLUID policy: the action is fixed by a
// deterministic fluid approximation (order up to the mean demand; in the last
// period also deplete stock exceeding what mean demand can absorb), then its
// expected value is evaluated.
kernel void iterate_fluid(const device uint *batch[[buffer(4)]],
                          const device float *parameters[[buffer(5)]],
                          const device float *distribution[[buffer(6)]],
                          const device float *inVector [[buffer(0)]],
                          device float *outVector [[ buffer(1) ]],
                          device float *deplete[[buffer(2)]],
                          device float *order[[buffer(3)]],
                          uint id [[ thread_position_in_grid ]]) {

    // Unpack model parameters (layout fixed by the host-side Swift code).
    int K = int(parameters[0]), L = int(parameters[1]), numPeriods = int(parameters[10]);
    int max_demand = int(parameters[8]);
    int mean_demand = int(parameters[9]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    int t = batch[2];  // current period index

    uint idCurrent = batch[0]*batch[1] + id;

    // Decode the current state to obtain per-age levels and total stock.
    int idState[max_dimension + 1];
    int idSum = 0, index = idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    // Determine the fluid action for this state.
    int fluid_deplete = 0, fluid_order = 0;
    if (t < numPeriods - 1) {
        // Before the last period: only order up to the mean demand.
        if (mean_demand - idSum > 1e-6) {
            fluid_order = int(mean_demand - idSum);
        }
    } else {
        // Last period: deplete the stock that cumulative mean demand cannot
        // absorb within its remaining lifetime, then order up to mean demand.
        int idPartialSum = 0;
        for (int l = 1; l < L + 1; l++) {
            idPartialSum += idState[l - 1];
            if (idPartialSum - l * mean_demand > fluid_deplete + 1e-6) {
                fluid_deplete = idPartialSum - l * mean_demand;
            }
        }
        if (mean_demand + fluid_deplete - idSum > 1e-6) {
            fluid_order = int(mean_demand + fluid_deplete - idSum);
        }
    }

    // Evaluate the single fluid action (kept only if strictly positive,
    // matching the optimal-policy comparison against a zero baseline).
    float opt_value = 0.;
    float state_value = expectedStateValue(idCurrent, fluid_deplete, fluid_order,
                                           K, L, max_demand,
                                           distribution, inVector,
                                           salvageValue, holdingCost,
                                           orderCost, disposalCost,
                                           discountRate, price);
    if (state_value > opt_value + 1e-6) {
        opt_value = state_value;
    }

    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(fluid_deplete);
    order[idCurrent] = float(fluid_order);
}

// One backward-induction step of the NVP (newsvendor-type) policy: before the
// last period, order up to capacity K-1; in the last period, search the pruned
// window around the parent's action (as in the optimal policy).
kernel void iterate_NVP(const device uint *batch[[buffer(4)]],
                        const device float *parameters[[buffer(5)]],
                        const device float *distribution[[buffer(6)]],
                        const device float *inVector [[buffer(0)]],
                        device float *outVector [[ buffer(1) ]],
                        device float *deplete[[buffer(2)]],
                        device float *order[[buffer(3)]],
                        uint id [[ thread_position_in_grid ]]) {

    // Unpack model parameters (layout fixed by the host-side Swift code).
    int K = int(parameters[0]), L = int(parameters[1]), numPeriods = int(parameters[10]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    int t = batch[2];  // current period index

    uint idCurrent = batch[0]*batch[1] + id;
    uint idParent = idCurrent - batch[0];

    // Decode the current state to obtain the total stock level.
    int idState[max_dimension + 1];
    int idSum = 0, index = idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    int min_deplete = 0, max_deplete = 1;
    int min_order = 0, max_order = K;

    if (t < numPeriods - 1) {
        // NVP rule: order up to capacity K-1 (a single candidate order size).
        if (K - 1 - idSum > 1e-6) {
            min_order = int(K - 1 - idSum);
            max_order = int(K - 1 - idSum) + 1;
        }
    } else {
        // Last period: prune around the parent state's action, as in iterate.
        if (idCurrent != 0) {
            min_deplete = int(deplete[idParent]) + int(deplete[idParent] != 0.);
            max_deplete = int(deplete[idParent]) + 2;
            int min_order_1 = int(order[idParent]) + int(deplete[idParent] != 0.) - 1;
            min_order = min_order_1 * int(min_order_1 >= 0);  // clamp at zero
            max_order = int(order[idParent]) + 1;
        }
    }

    int NVP_deplete = 0;
    int NVP_order = 0;
    float opt_value = 0.;

    for (int i = min_deplete; i < max_deplete; i++) {
        for (int j = min_order; j < max_order; j++) {
            float state_value = expectedStateValue(idCurrent, i, j,
                                                   K, L, max_demand,
                                                   distribution, inVector,
                                                   salvageValue, holdingCost,
                                                   orderCost, disposalCost,
                                                   discountRate, price);
            if (state_value > opt_value + 1e-6) {
                opt_value = state_value;
                NVP_deplete = i;
                NVP_order = j;
            }
        }
    }

    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(NVP_deplete);
    order[idCurrent] = float(NVP_order);
}

// One backward-induction step of the NVP VARIANT policy: deplete the oldest
// stock down to a fixed threshold (parameters[11]), then order up to capacity
// K-1; the single resulting action is evaluated.
kernel void iterate_NVP_1(const device uint *batch[[buffer(4)]],
                          const device float *parameters[[buffer(5)]],
                          const device float *distribution[[buffer(6)]],
                          const device float *inVector [[buffer(0)]],
                          device float *outVector [[ buffer(1) ]],
                          device float *deplete[[buffer(2)]],
                          device float *order[[buffer(3)]],
                          uint id [[ thread_position_in_grid ]]) {

    // Unpack model parameters (layout fixed by the host-side Swift code).
    int K = int(parameters[0]), L = int(parameters[1]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    int deplete_down_to = int(parameters[11]);  // threshold for the oldest slot

    uint idCurrent = batch[0]*batch[1] + id;

    // Decode the current state to obtain per-age levels and total stock.
    int idState[max_dimension + 1];
    int idSum = 0, index = idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    // Determine the variant action: trim the oldest slot to the threshold,
    // then order up to capacity K-1.
    int NVP_1_deplete = 0, NVP_1_order = 0;
    if (idState[0] > deplete_down_to) {
        NVP_1_deplete = idState[0] - deplete_down_to;
    }
    if (K - 1 + NVP_1_deplete - idSum > 1e-6) {
        NVP_1_order = int(K - 1 + NVP_1_deplete - idSum);
    }

    // Evaluate the single action (kept only if strictly positive).
    float opt_value = 0.;
    float state_value = expectedStateValue(idCurrent, NVP_1_deplete, NVP_1_order,
                                           K, L, max_demand,
                                           distribution, inVector,
                                           salvageValue, holdingCost,
                                           orderCost, disposalCost,
                                           discountRate, price);
    if (state_value > opt_value + 1e-6) {
        opt_value = state_value;
    }

    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(NVP_1_deplete);
    order[idCurrent] = float(NVP_1_order);
}

// Sanity-check kernel for thread indexing concepts: for a 1-D dispatch,
// thread_position_in_grid should equal
// threadgroup_position_in_grid * threads_per_threadgroup + thread_position_in_threadgroup,
// so result[id] == id everywhere when the understanding is correct.
kernel void testThread(device float *result [[ buffer(0) ]],
                       uint id [[ thread_position_in_grid ]],
                       uint i [[ thread_position_in_threadgroup ]],
                       uint w [[ threadgroup_position_in_grid ]],
                       uint S [[ threads_per_threadgroup ]]) {

    if (id == w*S + i)
        result[id] = id;
    else
        result[id] = 0;
}