├── images
└── lousy_doc.png
├── ParallelDP.xcodeproj
├── project.xcworkspace
│ └── contents.xcworkspacedata
├── xcuserdata
│ └── jz.xcuserdatad
│ │ └── xcschemes
│ │ ├── xcschememanagement.plist
│ │ └── ParallelDP.xcscheme
└── project.pbxproj
├── ParallelDP
├── parameters.plist
├── main.swift
└── Shaders.metal
└── readme.md
/images/lousy_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reijz/parallel-computing_Swift_Metal/HEAD/images/lousy_doc.png
--------------------------------------------------------------------------------
/ParallelDP.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/ParallelDP.xcodeproj/xcuserdata/jz.xcuserdatad/xcschemes/xcschememanagement.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | SchemeUserState
6 |
7 | ParallelDP.xcscheme
8 |
9 | orderHint
10 | 0
11 |
12 |
13 | SuppressBuildableAutocreation
14 |
15 | 2A0EA2641C362F8A003BF0CF
16 |
17 | primary
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/ParallelDP/parameters.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ThreadExecutionWidth
6 | 512
7 | Capacity
8 | 24
9 | Dimension
10 | 4
11 | Mean_demand
12 | 20
13 | Deplete_threshold
14 | 20
15 | Periods
16 | 20
17 | HoldingCost
18 | 1.5
19 | SalvageValue
20 | 1
21 | OrderCost
22 | 5
23 | DisposalCost
24 | 1
25 | DiscountRate
26 | 1
27 | Price
28 | 10
29 | Distribution
30 |
31 | 2.06115362e-09
32 | 4.12230724e-08
33 | 4.12230724e-07
34 | 2.74820483e-06
35 | 1.37410241e-05
36 | 5.49640966e-05
37 | 0.000183213655
38 | 0.000523467587
39 | 0.00130866897
40 | 0.00290815326
41 | 0.00581630652
42 | 0.0105751028
43 | 0.0176251713
44 | 0.0271156481
45 | 0.0387366401
46 | 0.0516488535
47 | 0.06456106690000001
48 | 0.07595419639999999
49 | 0.0843935515
50 | 0.0888353174
51 | 0.0888353174
52 | 0.0846050642
53 | 0.07691369469999999
54 | 0.0668814737
55 | 0.0557345614
56 | 0.0445876491
57 | 0.0342981916
58 | 0.0254060679
59 | 0.0181471913
60 | 0.0125153044
61 | 0.00834353625
62 | 0.00538292661
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/ParallelDP.xcodeproj/xcuserdata/jz.xcuserdatad/xcschemes/ParallelDP.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
8 |
9 |
15 |
21 |
22 |
23 |
24 |
25 |
30 |
31 |
32 |
33 |
39 |
40 |
41 |
42 |
43 |
44 |
54 |
56 |
62 |
63 |
64 |
65 |
66 |
67 |
73 |
75 |
81 |
82 |
83 |
84 |
86 |
87 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/ParallelDP.xcodeproj/project.pbxproj:
--------------------------------------------------------------------------------
1 | // !$*UTF8*$!
2 | {
3 | archiveVersion = 1;
4 | classes = {
5 | };
6 | objectVersion = 46;
7 | objects = {
8 |
9 | /* Begin PBXBuildFile section */
10 | 2A0EA2691C362F8A003BF0CF /* main.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2A0EA2681C362F8A003BF0CF /* main.swift */; };
11 | 2A0EA2701C3654EE003BF0CF /* Shaders.metal in Sources */ = {isa = PBXBuildFile; fileRef = 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */; };
12 | /* End PBXBuildFile section */
13 |
14 | /* Begin PBXCopyFilesBuildPhase section */
15 | 2A0EA2631C362F8A003BF0CF /* CopyFiles */ = {
16 | isa = PBXCopyFilesBuildPhase;
17 | buildActionMask = 2147483647;
18 | dstPath = /usr/share/man/man1/;
19 | dstSubfolderSpec = 0;
20 | files = (
21 | );
22 | runOnlyForDeploymentPostprocessing = 1;
23 | };
24 | /* End PBXCopyFilesBuildPhase section */
25 |
26 | /* Begin PBXFileReference section */
27 | 2A0EA2651C362F8A003BF0CF /* ParallelDP */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = ParallelDP; sourceTree = BUILT_PRODUCTS_DIR; };
28 | 2A0EA2681C362F8A003BF0CF /* main.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = main.swift; sourceTree = ""; };
29 | 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shaders.metal; sourceTree = ""; };
30 | 2A6953E51C5793BE00C77AD8 /* parameters.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = parameters.plist; sourceTree = ""; };
31 | /* End PBXFileReference section */
32 |
33 | /* Begin PBXFrameworksBuildPhase section */
34 | 2A0EA2621C362F8A003BF0CF /* Frameworks */ = {
35 | isa = PBXFrameworksBuildPhase;
36 | buildActionMask = 2147483647;
37 | files = (
38 | );
39 | runOnlyForDeploymentPostprocessing = 0;
40 | };
41 | /* End PBXFrameworksBuildPhase section */
42 |
43 | /* Begin PBXGroup section */
44 | 2A0EA25C1C362F8A003BF0CF = {
45 | isa = PBXGroup;
46 | children = (
47 | 2A0EA2671C362F8A003BF0CF /* ParallelDP */,
48 | 2A0EA2661C362F8A003BF0CF /* Products */,
49 | );
50 | sourceTree = "";
51 | };
52 | 2A0EA2661C362F8A003BF0CF /* Products */ = {
53 | isa = PBXGroup;
54 | children = (
55 | 2A0EA2651C362F8A003BF0CF /* ParallelDP */,
56 | );
57 | name = Products;
58 | sourceTree = "";
59 | };
60 | 2A0EA2671C362F8A003BF0CF /* ParallelDP */ = {
61 | isa = PBXGroup;
62 | children = (
63 | 2A0EA2681C362F8A003BF0CF /* main.swift */,
64 | 2A0EA26F1C3654EE003BF0CF /* Shaders.metal */,
65 | 2A6953E51C5793BE00C77AD8 /* parameters.plist */,
66 | );
67 | path = ParallelDP;
68 | sourceTree = "";
69 | };
70 | /* End PBXGroup section */
71 |
72 | /* Begin PBXNativeTarget section */
73 | 2A0EA2641C362F8A003BF0CF /* ParallelDP */ = {
74 | isa = PBXNativeTarget;
75 | buildConfigurationList = 2A0EA26C1C362F8A003BF0CF /* Build configuration list for PBXNativeTarget "ParallelDP" */;
76 | buildPhases = (
77 | 2A0EA2611C362F8A003BF0CF /* Sources */,
78 | 2A0EA2621C362F8A003BF0CF /* Frameworks */,
79 | 2A0EA2631C362F8A003BF0CF /* CopyFiles */,
80 | );
81 | buildRules = (
82 | );
83 | dependencies = (
84 | );
85 | name = ParallelDP;
86 | productName = ParallelDP;
87 | productReference = 2A0EA2651C362F8A003BF0CF /* ParallelDP */;
88 | productType = "com.apple.product-type.tool";
89 | };
90 | /* End PBXNativeTarget section */
91 |
92 | /* Begin PBXProject section */
93 | 2A0EA25D1C362F8A003BF0CF /* Project object */ = {
94 | isa = PBXProject;
95 | attributes = {
96 | LastSwiftUpdateCheck = 0720;
97 | LastUpgradeCheck = 0720;
98 | ORGANIZATIONNAME = verse;
99 | TargetAttributes = {
100 | 2A0EA2641C362F8A003BF0CF = {
101 | CreatedOnToolsVersion = 7.2;
102 | };
103 | };
104 | };
105 | buildConfigurationList = 2A0EA2601C362F8A003BF0CF /* Build configuration list for PBXProject "ParallelDP" */;
106 | compatibilityVersion = "Xcode 3.2";
107 | developmentRegion = English;
108 | hasScannedForEncodings = 0;
109 | knownRegions = (
110 | en,
111 | );
112 | mainGroup = 2A0EA25C1C362F8A003BF0CF;
113 | productRefGroup = 2A0EA2661C362F8A003BF0CF /* Products */;
114 | projectDirPath = "";
115 | projectRoot = "";
116 | targets = (
117 | 2A0EA2641C362F8A003BF0CF /* ParallelDP */,
118 | );
119 | };
120 | /* End PBXProject section */
121 |
122 | /* Begin PBXSourcesBuildPhase section */
123 | 2A0EA2611C362F8A003BF0CF /* Sources */ = {
124 | isa = PBXSourcesBuildPhase;
125 | buildActionMask = 2147483647;
126 | files = (
127 | 2A0EA2701C3654EE003BF0CF /* Shaders.metal in Sources */,
128 | 2A0EA2691C362F8A003BF0CF /* main.swift in Sources */,
129 | );
130 | runOnlyForDeploymentPostprocessing = 0;
131 | };
132 | /* End PBXSourcesBuildPhase section */
133 |
134 | /* Begin XCBuildConfiguration section */
135 | 2A0EA26A1C362F8A003BF0CF /* Debug */ = {
136 | isa = XCBuildConfiguration;
137 | buildSettings = {
138 | ALWAYS_SEARCH_USER_PATHS = NO;
139 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
140 | CLANG_CXX_LIBRARY = "libc++";
141 | CLANG_ENABLE_MODULES = YES;
142 | CLANG_ENABLE_OBJC_ARC = YES;
143 | CLANG_WARN_BOOL_CONVERSION = YES;
144 | CLANG_WARN_CONSTANT_CONVERSION = YES;
145 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
146 | CLANG_WARN_EMPTY_BODY = YES;
147 | CLANG_WARN_ENUM_CONVERSION = YES;
148 | CLANG_WARN_INT_CONVERSION = YES;
149 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
150 | CLANG_WARN_UNREACHABLE_CODE = YES;
151 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
152 | CODE_SIGN_IDENTITY = "-";
153 | COPY_PHASE_STRIP = NO;
154 | DEBUG_INFORMATION_FORMAT = dwarf;
155 | ENABLE_STRICT_OBJC_MSGSEND = YES;
156 | ENABLE_TESTABILITY = YES;
157 | GCC_C_LANGUAGE_STANDARD = gnu99;
158 | GCC_DYNAMIC_NO_PIC = NO;
159 | GCC_NO_COMMON_BLOCKS = YES;
160 | GCC_OPTIMIZATION_LEVEL = 0;
161 | GCC_PREPROCESSOR_DEFINITIONS = (
162 | "DEBUG=1",
163 | "$(inherited)",
164 | );
165 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
166 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
167 | GCC_WARN_UNDECLARED_SELECTOR = YES;
168 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
169 | GCC_WARN_UNUSED_FUNCTION = YES;
170 | GCC_WARN_UNUSED_VARIABLE = YES;
171 | MACOSX_DEPLOYMENT_TARGET = 10.11;
172 | MTL_ENABLE_DEBUG_INFO = YES;
173 | ONLY_ACTIVE_ARCH = YES;
174 | SDKROOT = macosx;
175 | SWIFT_OPTIMIZATION_LEVEL = "-Onone";
176 | };
177 | name = Debug;
178 | };
179 | 2A0EA26B1C362F8A003BF0CF /* Release */ = {
180 | isa = XCBuildConfiguration;
181 | buildSettings = {
182 | ALWAYS_SEARCH_USER_PATHS = NO;
183 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
184 | CLANG_CXX_LIBRARY = "libc++";
185 | CLANG_ENABLE_MODULES = YES;
186 | CLANG_ENABLE_OBJC_ARC = YES;
187 | CLANG_WARN_BOOL_CONVERSION = YES;
188 | CLANG_WARN_CONSTANT_CONVERSION = YES;
189 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
190 | CLANG_WARN_EMPTY_BODY = YES;
191 | CLANG_WARN_ENUM_CONVERSION = YES;
192 | CLANG_WARN_INT_CONVERSION = YES;
193 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
194 | CLANG_WARN_UNREACHABLE_CODE = YES;
195 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
196 | CODE_SIGN_IDENTITY = "-";
197 | COPY_PHASE_STRIP = NO;
198 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
199 | ENABLE_NS_ASSERTIONS = NO;
200 | ENABLE_STRICT_OBJC_MSGSEND = YES;
201 | GCC_C_LANGUAGE_STANDARD = gnu99;
202 | GCC_NO_COMMON_BLOCKS = YES;
203 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
204 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
205 | GCC_WARN_UNDECLARED_SELECTOR = YES;
206 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
207 | GCC_WARN_UNUSED_FUNCTION = YES;
208 | GCC_WARN_UNUSED_VARIABLE = YES;
209 | MACOSX_DEPLOYMENT_TARGET = 10.11;
210 | MTL_ENABLE_DEBUG_INFO = NO;
211 | SDKROOT = macosx;
212 | };
213 | name = Release;
214 | };
215 | 2A0EA26D1C362F8A003BF0CF /* Debug */ = {
216 | isa = XCBuildConfiguration;
217 | buildSettings = {
218 | PRODUCT_NAME = "$(TARGET_NAME)";
219 | };
220 | name = Debug;
221 | };
222 | 2A0EA26E1C362F8A003BF0CF /* Release */ = {
223 | isa = XCBuildConfiguration;
224 | buildSettings = {
225 | PRODUCT_NAME = "$(TARGET_NAME)";
226 | };
227 | name = Release;
228 | };
229 | /* End XCBuildConfiguration section */
230 |
231 | /* Begin XCConfigurationList section */
232 | 2A0EA2601C362F8A003BF0CF /* Build configuration list for PBXProject "ParallelDP" */ = {
233 | isa = XCConfigurationList;
234 | buildConfigurations = (
235 | 2A0EA26A1C362F8A003BF0CF /* Debug */,
236 | 2A0EA26B1C362F8A003BF0CF /* Release */,
237 | );
238 | defaultConfigurationIsVisible = 0;
239 | defaultConfigurationName = Release;
240 | };
241 | 2A0EA26C1C362F8A003BF0CF /* Build configuration list for PBXNativeTarget "ParallelDP" */ = {
242 | isa = XCConfigurationList;
243 | buildConfigurations = (
244 | 2A0EA26D1C362F8A003BF0CF /* Debug */,
245 | 2A0EA26E1C362F8A003BF0CF /* Release */,
246 | );
247 | defaultConfigurationIsVisible = 0;
248 | defaultConfigurationName = Release;
249 | };
250 | /* End XCConfigurationList section */
251 | };
252 | rootObject = 2A0EA25D1C362F8A003BF0CF /* Project object */;
253 | }
254 |
--------------------------------------------------------------------------------
/ParallelDP/main.swift:
--------------------------------------------------------------------------------
1 | //
2 | // main.swift
3 | // ParallelDP
4 | //
5 | // Created by Jiheng Zhang on 1/1/2016.
6 | // Copyright © 2016 verse. All rights reserved.
7 | //
8 |
9 | import Foundation
10 | import MetalKit
11 |
12 | let fileManager = NSFileManager.defaultManager()
13 |
14 | // Specify the plist file while running
15 | let path = fileManager.currentDirectoryPath
16 | let args = Process.arguments
17 | if (args.count != 2) {
18 | print("Please specify the plist file for paremeters!")
19 | exit(1)
20 | }
21 | let plistPath = path + "/" + args[1]
22 | //print(plistPath)
23 |
24 | if !fileManager.fileExistsAtPath(plistPath) {
25 | print("Cannot find plist file!")
26 | exit(1)
27 | }
28 |
29 | // Reading paremeters from plist
30 | let dict = NSDictionary(contentsOfFile: plistPath)
31 | // print(dict)
32 | let numPeriods: Int! = dict!.valueForKey("Periods") as? Int
33 | let mean_demand: Int! = dict!.valueForKey("Mean_demand") as? Int
34 | let deplete_threshold: Int! = dict!.valueForKey("Deplete_threshold") as? Int
35 | let L: Int! = dict!.valueForKey("Dimension") as? Int
36 | let K: Int! = dict!.valueForKey("Capacity") as? Int
37 | let holdingCost: Float! = dict!.valueForKey("HoldingCost") as? Float
38 | let salvageValue: Float! = dict!.valueForKey("SalvageValue") as? Float
39 | let orderCost: Float! = dict!.valueForKey("OrderCost") as? Float
40 | let disposalCost: Float! = dict!.valueForKey("DisposalCost") as? Float
41 | let discountRate: Float! = dict!.valueForKey("DiscountRate") as? Float
42 | let price: Float! = dict!.valueForKey("Price") as? Float
43 | let dist: [Float]! = dict!.valueForKey("Distribution") as? [Float]
44 | // Allow user to specify threadExecutionWidth depending on their own hardware
45 | let threadExecutionWidth: Int! = dict!.valueForKey("ThreadExecutionWidth") as? Int
46 |
47 | print("The complexity is with Capacity", K, "and Dimension", L)
48 |
49 | let max_demand: Float = Float(dist.count)
50 | // The order matters
51 | let paramemterVector: [Float] = [
52 | Float(K),
53 | Float(L),
54 | salvageValue,
55 | holdingCost,
56 | orderCost,
57 | disposalCost,
58 | discountRate,
59 | price,
60 | max_demand,
61 | Float(mean_demand),
62 | Float(numPeriods),
63 | Float(deplete_threshold)
64 | ]
65 |
66 | // basic calcuation of buffer
67 | let numberOfStates = Int(pow(Double(K), Double(L)))
68 | let unitSize = sizeof(Float)
69 | let resultBufferSize = numberOfStates*unitSize
70 |
71 | // basic calculation of device related parameter
72 | let numThreadsPerGroup = MTLSize(width:threadExecutionWidth,height:1,depth:1)
73 |
74 | // Initialize Metal
75 | // Get the default device, which is the same as the one monitor is using
76 | var device: MTLDevice! = MTLCreateSystemDefaultDevice()
77 | // In the following, choose the device NOT used by monitor
78 | let devices: [MTLDevice] = MTLCopyAllDevices()
79 | for metalDevice: MTLDevice in devices {
80 | if metalDevice.headless == true {
81 | device = metalDevice
82 | }
83 | }
84 | // exit with an error message if all devices are used by monitor
85 | if !device.headless {
86 | print("no dedicated device found")
87 | exit(1)
88 | }
89 |
90 | // Build command queue
91 | var commandQueue: MTLCommandQueue! = device.newCommandQueue()
92 |
93 | // Allocate memory on device
94 | let resourceOption = MTLResourceOptions()
95 | var buffer:[MTLBuffer] = [
96 | device.newBufferWithLength(resultBufferSize, options: resourceOption),
97 | device.newBufferWithLength(resultBufferSize, options: resourceOption),
98 | device.newBufferWithLength(resultBufferSize, options: resourceOption), // depletion action
99 | device.newBufferWithLength(resultBufferSize, options: resourceOption) // order action
100 | ]
101 | var parameterBuffer:MTLBuffer = device.newBufferWithBytes(paramemterVector, length: unitSize*paramemterVector.count, options: resourceOption)
102 | // put distriburion buffer here
103 | var distributionBuffer:MTLBuffer = device.newBufferWithBytes(dist, length: unitSize*dist.count, options: resourceOption)
104 |
105 | // Get functions from Shaders and add to MTL library
106 | var DPLibrary: MTLLibrary! = device.newDefaultLibrary()
107 | let initDP = DPLibrary.newFunctionWithName("initialize")
108 | let pipelineFilterInit = try device.newComputePipelineStateWithFunction(initDP!)
109 | let iterateDP = DPLibrary.newFunctionWithName("iterate")
110 | let pipelineFilterIterate = try device.newComputePipelineStateWithFunction(iterateDP!)
111 | let iterateFluid = DPLibrary.newFunctionWithName("iterate_fluid")
112 | let pipelineFilterIterate_fluid = try device.newComputePipelineStateWithFunction(iterateFluid!)
113 | let iterateNVP = DPLibrary.newFunctionWithName("iterate_NVP")
114 | let pipelineFilterIterate_NVP = try device.newComputePipelineStateWithFunction(iterateNVP!)
115 | let iterateNVP_1 = DPLibrary.newFunctionWithName("iterate_NVP_1")
116 | let pipelineFilterIterate_NVP_1 = try device.newComputePipelineStateWithFunction(iterateNVP_1!)
117 |
118 | var start = NSDate()
119 | // Initialize
120 | for l: Int in 1...L {
121 |
122 | let batchSize:uint = uint(pow(Float(K),Float(l-1)))
123 | let numGroupsBatch = MTLSize(width:(Int(batchSize)+threadExecutionWidth-1)/threadExecutionWidth, height:1, depth:1)
124 | // print("Batch Size = ", batchSize)
125 |
126 | for batchIndex: uint in 1.. It was written on 2016-02-08, and posted on my jekyll site. Since I have retired my jekyll site, better to keep a copy here.
4 |
5 |
6 | In 4 weeks of part-time effort, with zero knowledge in Swift/Metal but the help of the Internet and curiosity, I evolved a [sample code](http://memkite.com/blog/2014/12/15/data-parallel-programming-with-metal-and-swift-for-iphoneipad-gpu) found on the Internet into a full feature and easy-to-use and -modify code for one of my research problems requiring large scale parallel computing. Please feel free to check out the Swift/Metal code from [my github repository](https://github.com/reijz/ParallelComputing-Swift-Metal) and let me know any feedback you may have.
7 |
8 | ## How did it get started
9 |
10 | Recently my colleagues and I have been working on *approximate dynamic programming* with applications in perishable inventory management. At some point, we needed to perform numerical experiments to demonstrate the nice theories and approximation algorithms we created to solve the problem (at least approximately).
11 |
12 | The computing task in a nut shell, putting aside the mathematics, is to initialize a large array of size $K^L$ (e.g., $K=9$, $L=8$) and then iterate the array according to some rule for a few (say $T$) times. **The initialization and each iteration can be done in parallel**.
13 |
14 | Nvidia has made a lot of buzz in the field of parallel computing (or Artificial Intelligence, Deep Learning, Neural-Network, or whatever buzz word they call it) by utilizing large number (about 2000) of computing cores in their video cards. Though I haven't coded using C/C++ for several years, I know the existence of parallel computing platforms such as CUDA and OpenCL, with the former making much louder sound.
15 |
16 | In the spring of 2015, I hired two smart kids majoring in CS and ECE on our campus to write a computer program for our mathematical model using CUDA. After several months of communication involving my PhD student (who knows the math deeply but not much about coding), the two programmers and myself, we got a working CUDA code which runs blazingly fast if you imagine how long such a computing task would take with traditional single-thread CPU computing.
17 |
18 |
19 | ## An adventure
20 |
21 | However, the CUDA code was not written in a clean way, making further changes quite difficult. As the research on theories develops, we need modifications to the algorithms from time to time, but programmers will never be so freely available. More importantly, I got curious about parallel computing, and wanted to know what's going on there. I managed to hire one of the programmers to continue working for me in the summer of 2015, as the other got an intern job at Tencent, a tech giant in China. We aimed to create a clean version of the code using OpenCL, as I have a top-configured MacPro (2013, a.k.a. "trash can"), which does not support CUDA. We failed that task. One reason is that OpenCL is indeed a lot harder to use than CUDA, although it can harness both CPU and GPU power. This is how [Metal](https://developer.apple.com/metal/), an OpenGL/CL replacement on Mac OS and much more beyond in my view, comes into play. The project turned out to be a nice adventure.
22 |
23 | > Things are NOT as difficult as they appear to be!
24 |
25 | We are probably living in the best age in the human history, despite pollution, inequality, poverty, etc., which anyway exist all time. The Internet and curiosity can teach you pretty much everything and enable you to get things done. The following are the main steps I took in this effort.
26 |
27 | 1. Naturally, I started by reading Apple's [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html), which was not a terribly long document, but fun and easy to read. Essentially, one just needs to focus on the chapter "Data-Parallel Compute Processing: Compute Command Encoder", which clearly instructed a standard procedure to do parallel computing. The document did not provide any sample code, and I happen to start with absolutely zero knowledge in Swift. So I found a swift [programming course](https://www.coursera.org/learn/swift-programming) on Coursera and went through the first three weeks' content. I also skimmed through the [The Swift Programming Language](https://swift.org/documentation/TheSwiftProgrammingLanguage(Swift2.2).epub) book published by Apple.
28 |
29 | 2. At the turn of 2015 to 2016, only a handful of related results could be found by Googling keywords like "parallel computing swift metal". The article written by [Amund Tveit](https://www.linkedin.com/in/amundtveit) a year earlier, [Data-Parallel Programming with Metal and Swift for iPhone/iPad GPU](http://memkite.com/blog/2014/12/15/data-parallel-programming-with-metal-and-swift-for-iphoneipad-gpu), is particularly helpful. With the help of the official [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html) and some googling, I managed to compile and run Amund's sample code with some minor tweaks (Swift is a developing language, so older sample code often needs small updates).
30 |
31 | The code basically utilizes Swift/Metal to compute the sigmoid function for a large number of input values in parallel. However, this is far from what I need, since my algorithm requires full control of assigning computing cores to individual elements in the array and more.
32 |
33 | 3. I found a nice [CUDA course](https://www.coursera.org/course/hetero) on Coursera and studied the first five weeks' content since Apple's [Metal Programming Guide](https://developer.apple.com/library/ios/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html) stated the prerequisites "You should be familiar with the Objective-C language and experienced in programming with OpenGL, OpenCL, or similar APIs."
34 | Despite all the grammar and formality, the key thing I learned from this course is "linearity", or how to assign computing tasks to cores. This is exactly what I needed to know.
35 | With this knowledge, I studied carefully the CUDA code written by my programmers.
36 |
37 | 4. Metal seems to take quite a different approach than CUDA. For example, CUDA has three function type qualifiers, `__device__`, `__global__` and `__host__`, which determine where the function can be called and executed. This enables quite some freedom in terms of structuring the code and designing the algorithm. I am sure the real "Kongfu" masters can write beautiful code with such freedom. But for the less talented general public, this seems overly complicated and may make code hard to read/debug/modify. Briefly, `__global__` functions can be called from the host side using the CUDA kernel call semantics `<<<grid_dim, block_dim>>>`, while `__device__` functions can only be called from other device or global functions. The `grid_dim` and `block_dim` help specify which cores compute which data.
38 |
39 | The Metal framework seems to build a "wall" between the device and host. Those functions on device are written in Objective C (with some limitations such as dynamic memory allocation is forbidden) while functions on host are written in Swift [^objectivec]. You don't **call** a device function, you only **schedule** the device function.
40 | For the device functions, there are some built-in input arguments, such as
41 |
42 | | Attribute Qualifier |
43 | | ------------- |
44 | | `[[ thread_position_in_grid ]]` |
45 | | `[[ thread_position_in_threadgroup ]]` |
46 | | `[[ thread_index_in_threadgroup ]]` |
47 | | `[[ threadgroup_position_in_grid ]]` |
48 | | `[[ threads_per_grid ]]` |
49 | | `[[ threads_per_threadgroup ]]` |
50 | | `[[ threadgroups_per_grid ]]` |
51 | | `[[ thread_execution_width ]]` |
52 |
53 | In other words, you never need to explicitly transmit the information about grid dimension and block dimension. I wrote the following code in order to test my understanding.
54 |
55 | Another disadvantage of such a "wall" is to make transmission of parameters inconvenient. And you certainly do not want to specify a set of parameters twice in both the Objective C for device and Swift for host. A workaround is to use shared buffer to transmit parameters. See my [code](https://github.com/reijz/ParallelComputing-Swift-Metal) for example.
56 |
57 | [^objectivec]: For those who prefer to use Objective C over Swift, they can still write function on host using Objective C. But the wall still exists.
58 |
59 |
60 | ```swift
61 | // to test the understanding of thread related concepts
62 | kernel void testThread(device float *result [[ buffer(0) ]],
63 | uint id [[ thread_position_in_grid ]],
64 | uint i [[ thread_position_in_threadgroup ]],
65 | uint w [[ threadgroup_position_in_grid ]],
66 | uint S [[ threads_per_threadgroup ]]) {
67 |
68 | if (id == w*S+i)
69 | result[id] = id;
70 | else
71 | result[id] = 0;
72 | }
73 | ```
74 |
75 |
76 |
77 | ## Beyond this project
78 |
79 | Well, I got a nice code for my research project. And my PhD student is using it to do extensive numerical experiments on my Mac Pro. On the way, I picked up a new programming language and learned a lot more. I also involved my PhD student in this adventure. Now he seems to feel more comfortable with coding.
80 |
81 | More importantly, this adventure triggers some thoughts on various related issues. I would like to share one, that is **knowing more enables you to see things clearly**.
82 |
83 | Coming back to Nvidia, "the" major player in parallel computing, they seem to have a line of both hardware and software products. Tesla K40 GPU seems to be a popular choice for high performance computing. As of January 2016, I got the following estimated price quote
84 |
85 |
86 | | |GPU Compute Node|Top configured MacPro|
87 | | ------------- | ------------- | ------------- |
88 | | Price | 11,000 USD | 9,000 USD |
89 | | CPU | 2 x Intel Xeon E5-2670 | 1 x Intel Xeon E5-2697 |
90 | | Memory | 64G | 64G |
91 | | GPU | 2 x Nvidia Tesla K40 | 2 x AMD FirePro D700 |
92 | | Hard disk | 1.2 TB 10K rpm | 1 TB SSD |
93 | | Something I don't know | 2 x GE network interfaces | |
94 | | Something I don't know | 1 x 56Gb Infiniband adapter | |
95 |
96 | **The notoriously expensive Apple product is cheaper by 20%.** Ok, the MacPro only has one Intel Xeon E5 (12-core) processor instead of two, but it has 1TB flash drive instead of traditional spinning hard disk. Anyway, it's fair to say these two configurations are comparable.
97 |
98 | I know you may want to talk about the **performance**. Since I have both CUDA and Swift/Metal codes, I did the comparison based on computing task for my research problem. For exactly the same set of [parameters]({{ site.url }}{{ site.asset }}/CUDA/parameters.plist), the following table shows the computing time (in seconds) on our university's Tesla K40 server and my MacPro, both using only a single GPU and float precision (instead of double precision).
99 |
100 | | `threadExecutionWidth` | Nvidia Tesla M2090| Nvidia Tesla K20 | Nvidia Tesla K40 | AMD FirePro D700 on MacPro | NVIDIA GeForce GT 650M on MacBook Pro (2012)|
101 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
102 | | 1 | 446 | 2451 | | 728 | NA |
103 | | 2 | 259 | 1406 | | 403 | NA |
104 | | 4 | 166 | 867 | | 217 | NA |
105 | | 8 | 124 | 307 | | 123 | NA |
106 | | 16 | 105 | 332 | | 87 | 245 |
107 | | 32 | **99** | 212 | | 99 | 131 |
108 | | 64 | 109 | 215 | | 106 | 88 |
109 | | 128 | 108 | **214** | | 106 | **79** |
110 | | 256 | 110 | **214** | | 107 | 81 |
111 | | 512 | 111 | **214** | | 108 | 84 |
112 | | 1024 | NA[^thread] | 215 | | 104 | 99 |
113 | | 2048 | NA | NA | | **87.36** | NA |
114 |
115 | The `threadExecutionWidth` is set in CUDA
116 |
117 | ```c++
118 | *block_dim = dim3(width:threadExecutionWidth, 1, 1);
119 | ```
120 |
121 | and is set in Swift/Metal
122 |
123 | ```swift
124 | let numThreadsPerGroup = MTLSize(width:threadExecutionWidth,height:1,depth:1)
125 | ```
126 |
127 | As of writing this article, I still do not understand how the `threadExecutionWidth` would affect the speed. In theory, it should not. But the little experiment shown in the table above suggests otherwise. And the relationship is not even monotone — it is not the case that a larger `threadExecutionWidth` always means faster speed, as is generally perceived.
128 |
129 | [^thread]: My programmer told me that `threadExecutionWidth` cannot be higher than 512 on Tesla M2090. [http://stackoverflow.com/questions/5062781/cuda-max-threads-in-a-block](http://stackoverflow.com/questions/5062781/cuda-max-threads-in-a-block) And Indeed, setting an NA `threadExecutionWidth`, the CUDA and swift code will compile and run, but yield completely wrong results.
130 |
131 |
132 | Yes, my **MacPro is about 10% faster than the Tesla M2090 GPU server, and much faster than the K20.** In addition to performing the parallel computing task, my MacPro serves as a github and VPN server for my whole research team. My team also runs traditional single-thread Python and Java code on this machine quite frequently. Moreover, I use it as a day-to-day computer, typing this article for example. Don't forget that I could do professional video/photo/sound editing and play stunning 3D games on this machine had I that skill set. As a final remark, my MacPro, with only 44W when idle and 270W at CPU max[^power], is a lot more energy efficient than most servers on the market.
133 |
134 | However, it is surprising that the 2,500 USD MacBook Pro 15" purchased in 2012 is the fastest in this test! There are simply many things we, as general users of those parallel computing platforms, don't know.
135 |
136 | [^power]: The MacPro power consumption can be found at [https://support.apple.com/en-us/HT201796](https://support.apple.com/en-us/HT201796)
137 |
138 | I am not surprised that Nvidia loses to Apple on scientific parallel computing using GPU, which is not a focus of Apple. Here is a screenshot I took while browsing the official [CUDA Document](http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3zfjtApED).
139 |
140 | 
141 |
142 | Did you notice the red line under those words like "Quadro" and "Mathematica"? The table is a picture, presumably prepared using something like Word, which highlights any misspelling. Only an irresponsible team would do such a lousy job! I have trouble believing how well things can be designed and optimized underneath the hood of the CUDA system. It seems Nvidia focuses too much on marketing, since I've heard so much about it while its documentation is so lousy. You may take a look at the Apple documentation, e.g., [Metal Shading Language Guide](https://developer.apple.com/library/ios/documentation/Metal/Reference/MetalShadingLanguageGuide/Introduction/Introduction.html), just for comparison. I don't mean that giving the documentation a face lift would make the product great. What I mean is that we need to focus on making truly great stuff in every aspect we could possibly think of.
--------------------------------------------------------------------------------
/ParallelDP/Shaders.metal:
--------------------------------------------------------------------------------
1 | //
2 | // Shaders.metal
3 | // ParallelDP
4 | //
5 | // Created by Jiheng Zhang on 1/1/2016.
6 | // Copyright © 2016 verse. All rights reserved.
7 | //
8 |
#include <metal_stdlib>
10 | using namespace metal;
11 |
12 | #define max_dimension 13
13 |
// Seed the terminal (salvage) value function, one state per thread.
//
// The host dispatches states in batches: batch[0] appears to be the batch
// stride and batch[1] the batch index (NOTE(review): confirm against the
// host-side dispatch in main.swift), so this thread owns state
// batch[0]*batch[1] + id and builds on the value already written for the
// state exactly one stride earlier.  Each extra unit of stock adds the
// per-unit salvage value (parameters[2]) to the parent state's value.
kernel void initialize(const device uint *batch[[buffer(1)]],
                       const device float *parameters[[buffer(2)]],
                       device float *initValue [[buffer(0)]],
                       uint id [[ thread_position_in_grid ]]) {

    const uint stateId  = batch[0] * batch[1] + id;
    const uint parentId = stateId - batch[0];

    // One additional unit on hand is worth one unit of salvage value.
    initValue[stateId] = initValue[parentId] + parameters[2];
}
28 |
// Compute the value function of the optimal policy: one backward-induction
// step of a perishable-inventory dynamic program.  Each thread evaluates a
// single inventory state and searches over (deplete, order) actions, taking
// the expectation over the demand distribution.
//
// State encoding: a state id is a base-K integer with L digits; digit l is
// the stock of remaining-lifetime class l (digit 0 = oldest, i.e. the units
// that expire at the end of this period).
//
// parameters[] layout (as read below):
//   [0] K (base of the state encoding)   [1] L (state dimension)
//   [2] salvage value    [3] holding cost    [4] order cost
//   [5] disposal cost    [6] discount rate   [7] selling price
//   [8] size of the demand support (d ranges over 0..max_demand-1)
//
// Buffers:
//   batch[0]/batch[1]: presumably batch stride / batch index (set by the
//       host) -- a thread owns state idCurrent = batch[0]*batch[1] + id.
//   inVector  : next-period value function (read only)
//   outVector : this-period value function (written at idCurrent)
//   deplete, order : per-state optimal actions; read at idParent to prune
//       the action search, written at idCurrent.
kernel void iterate(const device uint *batch[[buffer(4)]],
                    const device float *parameters[[buffer(5)]],
                    const device float *distribution[[buffer(6)]],
                    const device float *inVector [[buffer(0)]],
                    device float *outVector [[ buffer(1) ]],
                    device float *deplete[[buffer(2)]],
                    device float *order[[buffer(3)]],
                    uint id [[ thread_position_in_grid ]]) {

    // get the parameters
    int K = int(parameters[0]), L = int(parameters[1]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    // find current and parent id (parent = same thread position, one batch
    // stride earlier; its decisions are already computed)
    uint idCurrent = batch[0]*batch[1]+id;
    uint idParent = idCurrent - batch[0];

    // prepare a vector for decode; slot L receives the freshly ordered units
    int idState[max_dimension + 1];

    // range of optimization: default is "deplete nothing, try every order"
    int min_deplete = 0, max_deplete = 1;
    int min_order = 0, max_order = K;

    if (idCurrent != 0){
        // Narrow the (deplete, order) search window around the actions
        // recorded for the parent state.
        // NOTE(review): this prune assumes the optimal actions vary by at
        // most one unit between adjacent state ids (a monotonicity
        // property) -- confirm against the model before reusing.
        min_deplete = int(deplete[idParent]) + int(deplete[idParent] != 0.);
        max_deplete = int(deplete[idParent]) + 2;
        int min_order_1 = int(order[idParent]) + int(deplete[idParent] != 0.) - 1;
        min_order = min_order_1 * int(min_order_1 >= 0);  // clamp at zero
        max_order = int(order[idParent]) + 1;
    }

    int opt_deplete = 0;
    int opt_order = 0;
    // NOTE(review): opt_value starts at 0, so a state whose best achievable
    // expected value is negative is recorded as 0 -- confirm this is intended.
    float opt_value = 0.;
    float state_value = 0.;

    // exhaustive search over the (pruned) action window
    for (int i = min_deplete; i < max_deplete; i++){
        for (int j = min_order; j < max_order; j++){
            state_value= 0.;
            // expectation over the demand distribution
            for (int d = 0; d < max_demand; d++){
                // decode idCurrent into idState (base-K digits);
                // idSum accumulates the total units on hand
                int idSum= 0, index= idCurrent;
                for (int l = L - 1; l >= 0; l--) {
                    idState[l] = index % K;
                    idSum += idState[l];
                    index /= K;
                }
                idState[L] = 0;
                //deplete i units from idState, oldest classes first
                int remain_deplete = i;
                for (int l = 0; l < L; l++) {
                    if (remain_deplete <= idState[l]) {
                        idState[l] -= remain_deplete;
                        break;
                    } else {
                        remain_deplete -= idState[l];
                        idState[l] = 0;
                    }
                }
                //holding cost incurred on the stock left after depletion
                int hold = idSum - i;
                //order j units; they enter as the freshest class (slot L)
                idState[L]= j;
                //sell d units from idState, oldest (FIFO) first; sell counts
                //how many units demand actually absorbed
                int sell = 0, remain_sell = d;
                for (int l = 0; l < L+ 1; l++) {
                    if (remain_sell <= idState[l]) {
                        sell += remain_sell;
                        idState[l] -= remain_sell;
                        break;
                    } else {
                        remain_sell -= idState[l];
                        sell += idState[l];
                        idState[l] = 0;
                    }
                }
                //dispose expired items (class 0 perishes this period)
                int dispose = idState[0];
                idState[0]= 0;
                //get the index of the future state: classes age by one, so
                //digits 1..L of today become digits 0..L-1 tomorrow
                int future = 0;
                for (int l = 1; l < L + 1; l++) {
                    future *= K;
                    future += idState[l];
                }
                //get the value with respect to i, j, d: salvage revenue and
                //holding cost are immediate; ordering, sales, disposal and
                //the continuation value are discounted
                float state_value_sample = salvageValue * i
                    - holdingCost * hold
                    + discountRate * (-orderCost * j
                                      + price * sell
                                      - disposalCost * dispose
                                      + inVector[future]);
                state_value += (state_value_sample * distribution[d]);
            }
            // accept the action only on a strict improvement; the 1e-6
            // tolerance makes ties resolve to the smallest (i, j)
            if (state_value > opt_value + 1e-6){
                opt_value = state_value;
                opt_deplete = i;
                opt_order = j;
            }
        }
    }

    // record the value and the maximizing action for this state
    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(opt_deplete);
    order[idCurrent] = float(opt_order);

}
145 |
// Compute the value function of the fluid policy: same DP evaluation as
// `iterate`, but instead of searching over actions, each state's (deplete,
// order) pair is fixed by a deterministic "fluid" rule that replaces random
// demand by its mean.  Uses the same base-K state encoding as `iterate`.
//
// Extra inputs relative to `iterate`:
//   parameters[9]  - mean demand
//   parameters[10] - number of periods
//   batch[2]       - current period index t (set by the host;
//                    NOTE(review): the else-branch below runs only when
//                    t >= numPeriods-1, which reads as "the last period"
//                    if t counts up -- confirm against main.swift)
kernel void iterate_fluid(const device uint *batch[[buffer(4)]],
                          const device float *parameters[[buffer(5)]],
                          const device float *distribution[[buffer(6)]],
                          const device float *inVector [[buffer(0)]],
                          device float *outVector [[ buffer(1) ]],
                          device float *deplete[[buffer(2)]],
                          device float *order[[buffer(3)]],
                          uint id [[ thread_position_in_grid ]]) {

    // get the parameters
    int K = int(parameters[0]), L = int(parameters[1]), numPeriods= int(parameters[10]);
    int max_demand = int(parameters[8]);
    int mean_demand = int(parameters[9]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    // current period index, supplied by the host
    int t = batch[2];

    // find current id (no parent lookup: the action is rule-based)
    uint idCurrent = batch[0]*batch[1]+id;

    // prepare a vector for decode
    int idState[max_dimension + 1];

    // decode idCurrent into base-K digits; idSum = total units on hand
    int idSum= 0, index= idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    int fluid_deplete = 0, fluid_order = 0;
    // determine the action for a specific state
    if (t < numPeriods -1) {
        // non-terminal case: order just enough to raise total stock to the
        // mean demand; never deplete
        if (mean_demand - idSum > 1e-6){
            fluid_order = int(mean_demand - idSum);
        }
    }
    else {
        // terminal case: deplete any stock that cannot be consumed by mean
        // demand before it expires (largest surplus over the age-prefix
        // sums), then order back up to mean demand plus what was depleted
        int idPartialSum = 0;
        for (int l = 1; l < L + 1; l++) {
            idPartialSum += idState[l-1];
            if (idPartialSum - l* mean_demand > fluid_deplete + 1e-6){
                fluid_deplete = idPartialSum - l* mean_demand;
            }
        }
        if (mean_demand + fluid_deplete - idSum > 1e-6){
            fluid_order = int(mean_demand + fluid_deplete - idSum);
        }

    }
    float opt_value = 0, state_value = 0;

    // single-iteration loops: evaluate exactly the fluid action chosen above
    for (int i = fluid_deplete; i < fluid_deplete + 1; i++){
        for (int j = fluid_order; j < fluid_order + 1; j++){
            state_value= 0.;
            // expectation over the demand distribution
            for (int d = 0; d < max_demand; d++){
                // decode idCurrent into idState (re-done per d because
                // idState is mutated below)
                int idSum= 0, index= idCurrent;
                for (int l = L - 1; l >= 0; l--) {
                    idState[l] = index % K;
                    idSum += idState[l];
                    index /= K;
                }
                idState[L] = 0;
                //deplete i units from idState, oldest classes first
                int remain_deplete = i;
                for (int l = 0; l < L; l++) {
                    if (remain_deplete <= idState[l]) {
                        idState[l] -= remain_deplete;
                        break;
                    } else {
                        remain_deplete -= idState[l];
                        idState[l] = 0;
                    }
                }
                //holding cost incurred on the stock left after depletion
                int hold = idSum - i;
                //order j units; they enter as the freshest class (slot L)
                idState[L]= j;
                //sell d units from idState, oldest (FIFO) first
                int sell = 0, remain_sell = d;
                for (int l = 0; l < L+ 1; l++) {
                    if (remain_sell <= idState[l]) {
                        sell += remain_sell;
                        idState[l] -= remain_sell;
                        break;
                    } else {
                        remain_sell -= idState[l];
                        sell += idState[l];
                        idState[l] = 0;
                    }
                }
                //dispose expired items (class 0 perishes this period)
                int dispose = idState[0];
                idState[0]= 0;
                //get the index of the future state (classes age by one)
                int future = 0;
                for (int l = 1; l < L + 1; l++) {
                    future *= K;
                    future += idState[l];
                }
                //get the value with respect to i, j, d
                float state_value_sample = salvageValue * i
                    - holdingCost * hold
                    + discountRate * (-orderCost * j
                                      + price * sell
                                      - disposalCost * dispose
                                      + inVector[future]);
                state_value += (state_value_sample * distribution[d]);
            }
            // NOTE(review): opt_value starts at 0, so a negative expected
            // value is floored at 0 -- consistent with `iterate`
            if (state_value > opt_value + 1e-6){
                opt_value = state_value;
            }
        }
    }

    // record the value and the fluid action for this state
    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(fluid_deplete);
    order[idCurrent] = float(fluid_order);

}
274 |
// Compute the value function of the NVP (newsvendor-type) policy: same DP
// evaluation as `iterate`, but in non-terminal periods the order quantity is
// forced to bring total stock up to K-1 (an order-up-to rule) with no
// depletion; in the terminal case it falls back to the parent-pruned action
// search used by `iterate`.  Uses the same base-K state encoding.
//
// Extra inputs relative to `iterate`:
//   parameters[10] - number of periods
//   batch[2]       - current period index t (set by the host;
//                    NOTE(review): confirm direction of t against main.swift)
kernel void iterate_NVP(const device uint *batch[[buffer(4)]],
                        const device float *parameters[[buffer(5)]],
                        const device float *distribution[[buffer(6)]],
                        const device float *inVector [[buffer(0)]],
                        device float *outVector [[ buffer(1) ]],
                        device float *deplete[[buffer(2)]],
                        device float *order[[buffer(3)]],
                        uint id [[ thread_position_in_grid ]]) {

    // get the parameters
    int K = int(parameters[0]), L = int(parameters[1]), numPeriods= int(parameters[10]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    // current period index, supplied by the host
    int t = batch[2];

    // find current and parent id (parent used only in the terminal branch)
    uint idCurrent = batch[0]*batch[1]+id;
    uint idParent = idCurrent - batch[0];

    // prepare a vector for decode
    int idState[max_dimension + 1];

    // decode idCurrent into base-K digits; idSum = total units on hand
    int idSum= 0, index= idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    // default action window: deplete nothing, try every order quantity
    int min_deplete = 0, max_deplete = 1;
    int min_order = 0, max_order = K;

    if (t < numPeriods- 1) {
        // non-terminal: pin the order to exactly K-1 - idSum (order-up-to
        // level K-1).  When idSum >= K-1 the window is left at the full
        // 0..K-1 range -- NOTE(review): confirm that fallback is intended.
        if (K-1 - idSum > 1e-6){
            min_order = int(K-1 - idSum);
            max_order = int(K-1 - idSum) + 1;
        }
    }
    else {
        // terminal: parent-pruned search, identical in form to `iterate`
        // (assumes action monotonicity between adjacent state ids)
        if (idCurrent != 0){
            min_deplete = int(deplete[idParent]) + int(deplete[idParent] != 0.);
            max_deplete = int(deplete[idParent]) + 2;
            int min_order_1 = int(order[idParent]) + int(deplete[idParent] != 0.) - 1;
            min_order = min_order_1 * int(min_order_1 >= 0);  // clamp at zero
            max_order = int(order[idParent]) + 1;
        }
    }

    int NVP_deplete = 0;
    int NVP_order = 0;
    // NOTE(review): opt_value starts at 0 -> negative values floored at 0
    float opt_value = 0.;
    float state_value = 0.;

    // search over the action window fixed above
    for (int i = min_deplete; i < max_deplete; i++){
        for (int j = min_order; j < max_order; j++){
            state_value= 0.;
            // expectation over the demand distribution
            for (int d = 0; d < max_demand; d++){
                // decode idCurrent into idState (re-done per d because
                // idState is mutated below)
                int idSum= 0, index= idCurrent;
                for (int l = L - 1; l >= 0; l--) {
                    idState[l] = index % K;
                    idSum += idState[l];
                    index /= K;
                }
                idState[L] = 0;
                //deplete i units from idState, oldest classes first
                int remain_deplete = i;
                for (int l = 0; l < L; l++) {
                    if (remain_deplete <= idState[l]) {
                        idState[l] -= remain_deplete;
                        break;
                    } else {
                        remain_deplete -= idState[l];
                        idState[l] = 0;
                    }
                }
                //holding cost incurred on the stock left after depletion
                int hold = idSum - i;
                //order j units; they enter as the freshest class (slot L)
                idState[L]= j;
                //sell d units from idState, oldest (FIFO) first
                int sell = 0, remain_sell = d;
                for (int l = 0; l < L+ 1; l++) {
                    if (remain_sell <= idState[l]) {
                        sell += remain_sell;
                        idState[l] -= remain_sell;
                        break;
                    } else {
                        remain_sell -= idState[l];
                        sell += idState[l];
                        idState[l] = 0;
                    }
                }
                //dispose expired items (class 0 perishes this period)
                int dispose = idState[0];
                idState[0]= 0;
                //get the index of the future state (classes age by one)
                int future = 0;
                for (int l = 1; l < L + 1; l++) {
                    future *= K;
                    future += idState[l];
                }
                //get the value with respect to i, j, d
                float state_value_sample = salvageValue * i
                    - holdingCost * hold
                    + discountRate * (-orderCost * j
                                      + price * sell
                                      - disposalCost * dispose
                                      + inVector[future]);
                state_value += (state_value_sample * distribution[d]);
            }
            // strict improvement with 1e-6 tolerance, as in `iterate`
            if (state_value > opt_value + 1e-6){
                opt_value = state_value;
                NVP_deplete = i;
                NVP_order = j;
            }
        }
    }

    // record the value and the chosen action for this state
    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(NVP_deplete);
    order[idCurrent] = float(NVP_order);

}
407 |
// Compute the value function of the NVP_variant policy: same DP evaluation
// as `iterate`, with a deterministic rule -- deplete the oldest class down
// to a threshold, then order up to total stock K-1 (net of the depletion).
// Uses the same base-K state encoding as `iterate`.
//
// Extra input relative to `iterate`:
//   parameters[11] - deplete-down-to threshold for the oldest class
kernel void iterate_NVP_1(const device uint *batch[[buffer(4)]],
                          const device float *parameters[[buffer(5)]],
                          const device float *distribution[[buffer(6)]],
                          const device float *inVector [[buffer(0)]],
                          device float *outVector [[ buffer(1) ]],
                          device float *deplete[[buffer(2)]],
                          device float *order[[buffer(3)]],
                          uint id [[ thread_position_in_grid ]]) {

    // get the parameters
    int K = int(parameters[0]), L = int(parameters[1]);
    int max_demand = int(parameters[8]);
    float salvageValue = parameters[2];
    float holdingCost = parameters[3];
    float orderCost = parameters[4];
    float disposalCost = parameters[5];
    float discountRate = parameters[6];
    float price = parameters[7];

    // oldest-class stock above this level is depleted
    int deplete_down_to = int(parameters[11]);

    // find current id (no parent lookup: the action is rule-based)
    uint idCurrent = batch[0]*batch[1]+id;

    // prepare a vector for decode
    int idState[max_dimension + 1];

    // decode idCurrent into base-K digits; idSum = total units on hand
    int idSum= 0, index= idCurrent;
    for (int l = L - 1; l >= 0; l--) {
        idState[l] = index % K;
        idSum += idState[l];
        index /= K;
    }
    idState[L] = 0;

    int NVP_1_deplete = 0, NVP_1_order = 0;
    // determine the action for a specific state: trim the oldest class down
    // to the threshold, then order back up to total stock K-1
    if (idState[0] > deplete_down_to) {
        NVP_1_deplete = idState[0]- deplete_down_to;
    }
    if (K-1 + NVP_1_deplete - idSum > 1e-6){
        NVP_1_order = int(K- 1 + NVP_1_deplete - idSum);
    }


    float opt_value = 0, state_value = 0;

    // single-iteration loops: evaluate exactly the rule-based action above
    for (int i = NVP_1_deplete; i < NVP_1_deplete + 1; i++){
        for (int j = NVP_1_order; j < NVP_1_order + 1; j++){
            state_value= 0.;
            // expectation over the demand distribution
            for (int d = 0; d < max_demand; d++){
                // decode idCurrent into idState (re-done per d because
                // idState is mutated below)
                int idSum= 0, index= idCurrent;
                for (int l = L - 1; l >= 0; l--) {
                    idState[l] = index % K;
                    idSum += idState[l];
                    index /= K;
                }
                idState[L] = 0;
                //deplete i units from idState, oldest classes first
                int remain_deplete = i;
                for (int l = 0; l < L; l++) {
                    if (remain_deplete <= idState[l]) {
                        idState[l] -= remain_deplete;
                        break;
                    } else {
                        remain_deplete -= idState[l];
                        idState[l] = 0;
                    }
                }
                //holding cost incurred on the stock left after depletion
                int hold = idSum - i;
                //order j units; they enter as the freshest class (slot L)
                idState[L]= j;
                //sell d units from idState, oldest (FIFO) first
                int sell = 0, remain_sell = d;
                for (int l = 0; l < L+ 1; l++) {
                    if (remain_sell <= idState[l]) {
                        sell += remain_sell;
                        idState[l] -= remain_sell;
                        break;
                    } else {
                        remain_sell -= idState[l];
                        sell += idState[l];
                        idState[l] = 0;
                    }
                }
                //dispose expired items (class 0 perishes this period)
                int dispose = idState[0];
                idState[0]= 0;
                //get the index of the future state (classes age by one)
                int future = 0;
                for (int l = 1; l < L + 1; l++) {
                    future *= K;
                    future += idState[l];
                }
                //get the value with respect to i, j, d
                float state_value_sample = salvageValue * i
                    - holdingCost * hold
                    + discountRate * (-orderCost * j
                                      + price * sell
                                      - disposalCost * dispose
                                      + inVector[future]);
                state_value += (state_value_sample * distribution[d]);
            }
            // NOTE(review): opt_value starts at 0 -> negative expected
            // values are floored at 0, as in the other kernels
            if (state_value > opt_value + 1e-6){
                opt_value = state_value;
            }
        }
    }

    // record the value and the rule-based action for this state
    outVector[idCurrent] = opt_value;
    deplete[idCurrent] = float(NVP_1_deplete);
    order[idCurrent] = float(NVP_1_order);

}
525 |
526 |
// Sanity check of Metal's thread-indexing identity:
//   thread_position_in_grid ==
//       threadgroup_position_in_grid * threads_per_threadgroup
//       + thread_position_in_threadgroup.
// Each thread writes its global id where the identity holds and 0 where it
// does not, so the host can inspect the output buffer.
kernel void testThread(device float *result [[ buffer(0) ]],
                       uint id [[ thread_position_in_grid ]],
                       uint i [[ thread_position_in_threadgroup ]],
                       uint w [[ threadgroup_position_in_grid ]],
                       uint S [[ threads_per_threadgroup ]]) {

    const bool consistent = (w * S + i == id);
    result[id] = consistent ? float(id) : 0.0f;
}
539 |
--------------------------------------------------------------------------------