├── images
│   ├── i7.jpg
│   ├── GK110.jpg
│   ├── Julia6x.png
│   ├── warp-branch.png
│   ├── false-sharing.gif
│   └── 40-years-processor-trend.png
├── Project.toml
├── 100 Overview.jl
├── README.jl
├── 060 Tasks.jl
├── 080 GPUs.jl
├── 030 SIMD.jl
├── 020 Serial Performance.jl
├── 050 Parallel Algorithms.jl
├── 040 Multithreading.jl
├── 070 Distributed.jl
└── Manifest.toml

/images/i7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/i7.jpg
--------------------------------------------------------------------------------
/images/GK110.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/GK110.jpg
--------------------------------------------------------------------------------
/images/Julia6x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/Julia6x.png
--------------------------------------------------------------------------------
/images/warp-branch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/warp-branch.png
--------------------------------------------------------------------------------
/images/false-sharing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/false-sharing.gif
--------------------------------------------------------------------------------
/images/40-years-processor-trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/40-years-processor-trend.png
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Compose = "a81c6b42-2e10-5240-aca2-a61377ecd94b"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
FileWatching = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Gadfly = "c91e804a-d5a3-530f-b6f0-dfbca275c004"
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
--------------------------------------------------------------------------------
/100 Overview.jl:
--------------------------------------------------------------------------------
# # Summary
#
# * Challenges of parallel computing
#     * Order of execution
#         * the possibility of out-of-order execution
#         * simultaneous access and mutation
#     * Data access and movement
#     * Code access and movement
#     * Appropriately matching the parallelism strategy to your machine capabilities
#     * Appropriately matching the parallelism strategy with the problem at hand
#
# * Parallelism strategies
#     * SIMD
#     * Multithreading
#     * Tasks
#     * Multi-process
#         * Shared memory
#         * Distributed memory
#     * GPU programming
#
#
# ## Why so many kinds of parallelism?
#
# * Not all problems are created equal
# * Not all computing machines are created equal
# * We want to maximize computing while minimizing overhead
# * The chosen solution will depend upon the amount of computing in each inner loop
#   and the amount of synchronization that is required between loops.

--------------------------------------------------------------------------------
/README.jl:
--------------------------------------------------------------------------------
# # JuliaCon 2019 Parallel Computing Workshop
#
# This workshop will cover:
#
# * Introduction to parallelism
#     * What is happening to our computers?
#
# * Parallelism strategies
#     * SIMD and best single-core performance (brief overview)
#     * Multi-threading (hands on)
#     * Cooperative multi-tasking
#     * Parallel algorithm design
#     * Multi-process (hands on)
#         * Shared memory
#         * Distributed memory
#     * GPU programming
#
# * Challenges of parallel computing
#     * Order of execution
#         * the possibility of out-of-order execution
#         * race conditions with simultaneous access and mutation
#     * Data access and movement
#     * Code access and movement
#     * Appropriately matching the parallelism strategy to your machine capabilities
#     * Appropriately matching the parallelism strategy with the problem at hand

#-

# ## What is happening to our computers!?
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/40-years-processor-trend.png)
#
# Not only have we gained multiple cores, but processors have become extremely
# complex, with multiple levels of caches, pipelines, predictions, speculations...
#
# ## What is hard about parallel computing
# * We don't think in parallel
# * We learn to write and reason about programs serially
# * The desire for parallelism often comes _after_ you've written your algorithm (and found it too slow!)
#
# ## Summary:
# * Current computer architectures push us towards parallel programming for peak performance — even if we're not on a cluster!
# * But it's hard to design good parallel algorithms
# * And it's hard to express and reason about those algorithms
--------------------------------------------------------------------------------
/060 Tasks.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # A brief introduction to Tasks
#
# You're working on a computer that's doing _lots_ of things. It's managing
# inputs and outputs, delegating control of the CPU between Julia and _all_ of
# the other applications you have running. This wasn't always the case — does
# anyone remember the days before you could just switch between applications?
#
# It's not really doing all these things at once, but for the most part it
# gives the _appearance_ of parallelism. We think about our computers as doing
# _lots_ of things simultaneously — but it's not really simultaneous. It's just
# switching between tasks so fast that it feels simultaneous.
#
# This kind of task switching is perfect for situations like an operating system
# where you're just waiting for user input most of the time. The OS multitasking
# you're familiar with is called "preemptive" multitasking — the operating system
# sits at the top and can arbitrarily control who gets to run when. Julia's task
# system uses cooperative multitasking (also known as coroutines or green threads).
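# To make "cooperative" concrete, here's a minimal sketch (the names are just
# illustrative; the `@async` macro introduced below is the usual spelling): a
# `Task` wraps a zero-argument function, `schedule` hands it to Julia's
# scheduler, and control only switches at yield points like `yield()`,
# `sleep`, or I/O.

function count_up()
    for i in 1:3
        println("cooperative step $i")
        yield() # voluntarily hand control back to the scheduler
    end
end
t = Task(count_up)
schedule(t)
wait(t) # block until the task has finished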

#-

# Tasks work best when they're waiting for some _external_ condition to complete
# their work. Let's say we had a directory "results" and wanted to process any
# new files that appeared there:

using FileWatching
isdir("results") || mkdir("results")
watch_folder("results", #= time out in seconds =# 5)

# Julia will happily sit there and wait for something to happen... but it's
# blocking anything else from happening while it's doing so! This is the perfect
# case for a Task. We can say we want a given expression to run asynchronously
# in a Task with the `@async` macro:

t = @async watch_folder("results") # no timeout means it will wait forever!

#-

run(`touch results/0.txt`)

#-

file, info = fetch(t)
file # |> process

# We can even bundle this up into a repeating task:

isdone = false
function process_folder(dir)
    !isdir("processed-results") && mkdir("processed-results")
    while !isdone
        file, info = watch_folder(dir)
        path = joinpath(dir, file)
        if isfile(path)
            print("processing $path...")
            run(`cp $path processed-results/$file`) # Or actually do real work...
        end
    end
end

t = @async process_folder("results")

#-

run(`touch results/1.txt`)
readdir("processed-results")

#-

run(`touch results/2.txt`)
readdir("processed-results")

#-

isdone = true
run(`touch results/3.txt`)
readdir("processed-results")

#-

run(`touch results/4.txt`)
readdir("processed-results")

#-

rm("results", recursive=true)
rm("processed-results", recursive=true)

# ## Quiz:
#
# How long will this take?

@time for i in 1:10
    sleep(1)
end

# What about this?

@time for i in 1:10
    @async sleep(1)
end

# And finally, this?

@time @sync for i in 1:10
    @async sleep(1)
end

# Now what if I had something that actually did work?

function work(N)
    series = 1.0
    for i in 1:N
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end
work(1)
@time work(100_000_000)

#-

@time @sync for i in 1:10
    @async work(100_000_000)
end

# # So what's happening here?
#
# `sleep` is nicely cooperating with our tasks:

methods(sleep)

# # Fetching values from tasks

# You can even fetch values from tasks:

t = @async (sleep(5); rand())

wait(t)

fetch(t)

# # Key takeaways
#
# There is a lot more to tasks, but they form the foundation for reasoning about
# actually _doing_ computation in parallel (and not just hoping that things will
# cooperate for us to emulate parallelism by task switching).
#
# * `@async` creates and starts running a task
# * `@sync` waits for them all to complete
# * We can reason about something that runs asynchronously and may return a value
#   at some point in the future with `fetch`. Or we can just `wait` for it.
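# As a small taste of that "lot more" (a sketch, not covered in this workshop):
# tasks can also hand values to one another through a `Channel`, where `put!`
# and `take!` block as needed to keep a producer and a consumer in step.

ch = Channel{Int}(4)   # a buffered channel holding up to four Ints
@async for i in 1:5
    put!(ch, i^2)      # blocks whenever the buffer is full
end
for _ in 1:5
    println(take!(ch)) # blocks until a value is available
end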
--------------------------------------------------------------------------------
/080 GPUs.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # GPUs
#
# The graphics processor in your computer is _itself_ like a mini-computer highly
# tailored for massively and embarrassingly parallel operations (like computing how light will bounce
# off of every point on a 3D mesh of triangles).
#
# Of course, recently their utility in other applications has become more clear,
# and thus the GPGPU was born.
#
# Just like how we needed to send data to other processes, we need to send our
# data to the GPU to do computations there.

#-

# ## How is a GPU different from a CPU?
#
# This is what a typical consumer CPU looks like:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/i7.jpg)
#
# And this is what a GPU looks like:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/GK110.jpg)
#
# Each SMX isn't just one "core"; each is a _streaming multiprocessor_ capable of running hundreds of threads simultaneously itself. There are so many threads, in fact, that you reason about them in groups of 32 — called a "warp". No, not [that warp](https://www.google.com/search?tbm=isch&q=warp&tbs=imgo:1), [this one](https://www.google.com/search?tbm=isch&q=warp%20weaving&tbs=imgo:1).
#
# The card above supports up to 6 warps per multiprocessor, with 32 threads each, times 15 multiprocessors... 2880 threads at a time!
#
# Also note the memory interfaces.
#
# --------------
#
# Each thread is relatively limited — and a warp is almost like a SIMD unit that supports branching.
# Except it's still only executing one instruction even after a branch:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/warp-branch.png)

#-

# You can inspect the installed GPUs with `nvidia-smi`:

run(`nvidia-smi`)

# ## Example
#
# The deep learning MNIST example: https://fluxml.ai/experiments/mnist/
#
# This is how it looks on the CPU:

using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition

imgs = MNIST.images()
labels = onehotbatch(MNIST.labels(), 0:9)

## Partition into batches of size 32
train = [(cat(float.(imgs[i])..., dims = 4), labels[:,i])
         for i in partition(1:60_000, 32)]
## Prepare test set (first 1,000 images)
tX = cat(float.(MNIST.images(:test)[1:1000])..., dims = 4)
tY = onehotbatch(MNIST.labels(:test)[1:1000], 0:9)

m = Chain(
    Conv((3, 3), 1=>32, relu),
    Conv((3, 3), 32=>32, relu),
    x -> maxpool(x, (2,2)),
    Conv((3, 3), 32=>16, relu),
    x -> maxpool(x, (2,2)),
    Conv((3, 3), 16=>10, relu),
    x -> reshape(x, :, size(x, 4)),
    Dense(90, 10), softmax)

loss(x, y) = crossentropy(m(x), y)
accuracy(x, y) = mean(onecold(m(x)) .== onecold(y))
opt = ADAM()
Flux.train!(loss, Flux.params(m), train[1:1], opt, cb = () -> @show(accuracy(tX, tY)))
@time Flux.train!(loss, Flux.params(m), train[1:10], opt, cb = () -> @show(accuracy(tX, tY)))

# Now let's re-do it on a GPU. "All" it takes is moving the data there with `gpu`!

include(datapath("scripts/fixupCUDNN.jl")) # JuliaBox uses an old version of CuArrays; this backports a fix for it
gputrain = gpu.(train[1:10])
gpum = gpu(m)
gputX = gpu(tX)
gputY = gpu(tY)
gpuloss(x, y) = crossentropy(gpum(x), y)
gpuaccuracy(x, y) = mean(onecold(gpum(x)) .== onecold(y))
gpuopt = ADAM()
Flux.train!(gpuloss, Flux.params(gpum), gpu.(train[1:1]), gpuopt, cb = () -> @show(gpuaccuracy(gputX, gputY)))
@time Flux.train!(gpuloss, Flux.params(gpum), gputrain, gpuopt, cb = () -> @show(gpuaccuracy(gputX, gputY)))

# ## Defining your own GPU kernels
#
# So that's leveraging Flux's ability to work with GPU arrays — which is magical
# and awesome — but you don't always have a library to lean on like that.
# How might you define your own GPU kernel?
#
# Recall the monte carlo pi example:

function serialpi(n)
    inside = 0
    for i in 1:n
        x, y = rand(), rand()
        inside += (x^2 + y^2 <= 1)
    end
    return 4 * inside / n
end

# How could we express this on the GPU?

using CuArrays.CURAND
function findpi_gpu(n)
    4 * sum(curand(Float64, n).^2 .+ curand(Float64, n).^2 .<= 1) / n
end
findpi_gpu(10_000_000)

#-

using BenchmarkTools
@btime findpi_gpu(10_000_000)
@btime serialpi(10_000_000)

# That leans on broadcast to build the GPU kernel — and is creating three arrays
# in the process — but it's still much faster than our serial pi from before.

#-

# In general, using CuArrays and broadcast is one of the best ways to just
# get everything to work.
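# For example, a minimal sketch of that workflow (assuming a CUDA-capable GPU
# and the CuArrays package from this project's environment):

using CuArrays
xs = cu(rand(10_000))  # copy the data over to the GPU
ys = 2 .* xs .^ 2 .+ 1 # the dotted operations fuse into a single GPU kernel
sum(ys)                # reductions run on the GPU, too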
# If you really want to get your hands dirty, you
# can use [CUDAnative.jl](https://github.com/JuliaGPU/CUDAnative.jl) to manually specify exactly how everything works,
# but be forewarned, it's not for the [faint of heart](https://github.com/JuliaGPU/CUDAnative.jl/blob/master/examples/reduce/reduce.jl)! (If you've done CUDA
# programming in C or C++, it's very similar.)
--------------------------------------------------------------------------------
/030 SIMD.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # SIMD: The parallelism that can (sometimes) happen automatically
#
# SIMD: Single-instruction, multiple data
#
# (Also confusingly called vectorization)

#-

# ## The architecture
#
# Instead of computing four sums sequentially:
#
# \begin{align}
# x_1 + y_1 &\rightarrow z_1 \\
# x_2 + y_2 &\rightarrow z_2 \\
# x_3 + y_3 &\rightarrow z_3 \\
# x_4 + y_4 &\rightarrow z_4
# \end{align}
#
# Modern processors have vector processing units that can do it all at once:
#
# $$
# \left(\begin{array}{cc}
# x_1 \\
# x_2 \\
# x_3 \\
# x_4
# \end{array}\right)
# +
# \left(\begin{array}{cc}
# y_1 \\
# y_2 \\
# y_3 \\
# y_4
# \end{array}\right)
# \rightarrow
# \left(\begin{array}{cc}
# z_1 \\
# z_2 \\
# z_3 \\
# z_4
# \end{array}\right)
# $$

#-

# ## Making it happen

#-

# Simple task: compute the sum of a vector:

A = rand(100_000)
function simplesum(A)
    result = zero(eltype(A))
    for i in eachindex(A)
        @inbounds result += A[i]
    end
    return result
end

simplesum(A)

#-

using BenchmarkTools
@btime simplesum($A)

# So, is that good?

@btime sum($A)

# We're slower than the builtin `sum` — and we're getting a different answer, too! Let's look at what happens with a 32-bit float instead of a 64-bit one. Each element has half the number of bits, so let's also double the length (so the total number of bits processed remains constant).

A32 = rand(Float32, length(A)*2)
@btime simplesum($A32)
@btime sum($A32);

# That's even worse! What's going on here? We're seeing a performance difference of an even multiple — perhaps Julia's builtin sum is using some parallelism? Let's try using SIMD ourselves:

function simdsum(A)
    result = zero(eltype(A))
    @simd for i in eachindex(A)
        @inbounds result += A[i]
    end
    return result
end
@btime simdsum($A)
@btime simdsum($A32)

# What did that do and why don't we always use `@simd for` — or why doesn't Julia
# just always use `@simd` for every `for` loop automatically? Look at the values:

simplesum(A), simdsum(A), sum(A)

#-

simplesum(A32), simdsum(A32), sum(A32)

# Why aren't they the same?
#
# Without `@simd`, Julia is doing _exactly_ what we told it to do: it's taking
# each element of our array and adding it to a big pile sequentially.
# Our answer
# is smaller than what Julia's builtin `sum` thinks it is: that's because as our
# pile gets bigger we begin losing the lower bits of each element that we're
# adding, and those small losses begin to add up!
#
# The `@simd` macro tells Julia that it can re-arrange floating point additions —
# even if it would change the answer. Depending on your CPU, this may lead to 2x or 4x
# or even 8x parallelism. Essentially, Julia is computing independent sums for
# the even indices and the odd indices simultaneously:
#
# \begin{align}
# odds &\leftarrow 0 \\
# evens &\leftarrow 0 \\
# \text{loop}&\ \text{odd}\ i: \\
# &\left(\begin{array}{cc}
# odds \\
# evens
# \end{array}\right)
# \leftarrow
# \left(\begin{array}{cc}
# odds \\
# evens
# \end{array}\right)
# +
# \left(\begin{array}{cc}
# x_{i} \\
# x_{i+1}
# \end{array}\right) \\
# total &\leftarrow evens + odds
# \end{align}
#
# In many cases, Julia can and does know that a for-loop can be SIMD-ed and it
# will take advantage of this by default!

B = rand(1:10, 100_000)
@btime simplesum($B)
@btime sum($B)
B32 = rand(Int32(1):Int32(10), length(B)*2)
@btime simplesum($B32)
@btime simdsum($B32)

# How can we see if something is getting vectorized? Look for vector types
# like `<8 x float>` in the generated LLVM IR:

@code_llvm simdsum(A32)

# So what are the challenges?
#
# * The biggest hurdle is that you have to convince Julia and LLVM that it's able to
#   use SIMD instructions for your given algorithm. That's not always possible.
# * There are lots of limitations on what can and cannot be SIMD-ed:

@doc @simd

# * You do need to think through the consequences of re-ordering your algorithm.

#-

# ## A slightly trickier case

using BenchmarkTools

#-

function diff!(A, B)
    A[1] = B[1]
    for i in 2:length(A)
        @inbounds A[i] = B[i] - B[i-1]
    end
    return A
end
A = zeros(Float32, 100_000)
B = rand(Float32, 100_000)

diff!(A, B)
[B[1];diff(B)] == A

#-

@btime diff!($A, $B)
@btime diff($B);

# But what happens if we do it in-place?

Bcopy = copy(B)
@btime diff!($Bcopy, $Bcopy);

# What happened?

@code_llvm diff!(A, B)

# We can manually assert that arrays don't alias (or have any loop-dependencies)
# with the very special `@simd ivdep` flag, but this can be disastrous:

function unsafe_diff!(A, B)
    A[1] = B[1]
    @simd ivdep for i in 2:length(A)
        @inbounds A[i] = B[i] - B[i-1]
    end
    return A
end
@btime unsafe_diff!($A, $B)
[B[1];diff(B)] == A
Bcopy = copy(B)
unsafe_diff!(Bcopy, Bcopy)
[B[1];diff(B)] == Bcopy

# If you really want to get your hands dirty, you can use the [SIMD.jl](https://github.com/eschnett/SIMD.jl)
# package to manually specify those `<8 x float>` things that LLVM generates.
# BUT: this is tricky and a pain; often it's enough just to be aware of what makes
# Julia code automatically SIMD-able, some of the cases where it may fail, and
# how to check its work.
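# Just to give a flavor of SIMD.jl, a minimal sketch (note that SIMD.jl is
# *not* part of this project's environment, so this assumes you've added it):

using SIMD
v = Vec{4,Float32}((1f0, 2f0, 3f0, 4f0)) # an explicit 4-wide SIMD vector
w = v + v # one vectorized add across all four lanes
sum(w)    # a horizontal reduction back to a scalar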

#-

# ## SIMD
#
# * Exploits built-in parallelism in a processor
# * Best for small, tight innermost loops
# * Often happens automatically if you're careful
#     * Follow the [performance best practices](https://docs.julialang.org/en/v1/manual/performance-tips/)
#     * `@inbounds` any array accesses
#     * No branches or (non-inlined) function calls
# * Can use `@simd` to allow Julia to break some rules to make it happen
#     * But be careful, especially with `@simd ivdep`!
# * Depending on processor and types involved, can yield 2-16x gains with extraordinarily little overhead
# * Smaller datatypes can improve this further; use `Float32` instead of `Float64`
#   if possible, `Int32` instead of `Int64`, etc.
# * When buying a new processor, look for [AVX-512](https://en.wikichip.org/wiki/x86/avx-512) support
--------------------------------------------------------------------------------
/020 Serial Performance.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Fast (serial) programming with Julia
#
# Yes, this is a parallel computing course — but to write efficient parallel
# programs we first must learn how to write fast serial Julia code. This is
# a rapid primer in high performance (serial) programming.
#
# I _highly_ recommend reviewing the [Performance Tips](https://docs.julialang.org/en/v1.1/manual/performance-tips/)
# in the manual. This is only going to briefly introduce some of the main concepts.

#-

# ## Measure, measure, measure.
#
# It is very easy to experiment in Julia; you can rapidly try many options and
# see which is the fastest.

#-

# Use the [BenchmarkTools](https://github.com/JuliaCI/BenchmarkTools.jl) package:

using BenchmarkTools

"""
    findclosest(data, point)

A simple example that returns the element in `data` that is closest to `point`
"""
function findclosest(data, point)
    _, index = findmin(abs.(data .- point))
    return data[index]
end
data = rand(5000)
findclosest(data, 0.5)

#-

@time findclosest(data, 0.5)

#-

@benchmark findclosest($data, $0.5)

# ### Profile!

using Profile

Profile.clear()
@profile for _ in 1:100000; findclosest(data, 0.5); end

Profile.print(maxdepth=11)

# ### Iterate!
#
# Before we had:
# ```julia
# function findclosest(data, point)
#     _, index = findmin(abs.(data .- point))
#     return data[index]
# end
# ```
#
# Let's come up with a new definition that can combine the two operations:

function findclosest2(data, point)
    bestval = first(data)
    bestdist = abs(bestval - point)
    for elt in data
        dist = abs(elt - point)
        if dist < bestdist
            bestval = elt
            bestdist = dist
        end
    end
    return bestval
end

## And do a spot-check to make sure we did the optimization correctly:
findclosest2(data, 0.5) == findclosest(data, 0.5)

#-

@benchmark findclosest2($data, $0.5)
# ## A quick word on macros
#
# Macros are those funny things starting with `@`. They can reinterpret what
# you write and do something different — essentially introducing a new keyword.
#
# For example, the `@assert` macro simply takes an expression and throws an
# exception if it returns `false`.

@assert 2+2 == 4

# It does this by literally re-writing what you wrote. You can see it in action
# with `@macroexpand`:

@macroexpand @assert 2+2 == 4

# Each macro can define its own special syntax, and this is used extensively for
# code introspection, serial performance improvements, and — perhaps most
# importantly — parallelization primitives!

#-

# ## How is Julia fast?
#
# By understanding the basics of how Julia _can_ be fast, you can get a better
# sense for how to write fast Julia code.
#
# Perhaps most importantly, Julia can reason about types. Recall: this is the definition of `findclosest2`:
#
# ```julia
# function findclosest2(data, point)
#     bestval = first(data)
#     bestdist = abs(bestval - point)
#     for elt in data
#         dist = abs(elt - point)
#         if dist < bestdist
#             bestval = elt
#             bestdist = dist
#         end
#     end
#     return bestval
# end
# ```

@code_typed optimize=false findclosest2(data, 0.5)

#-

typeof(data)

#-

newdata = Real[data...]
typeof(newdata)

#-

@code_typed optimize=false findclosest2(newdata, 0.5)

#-

@benchmark findclosest2(newdata, 0.5)

#-

@code_warntype findclosest2(newdata, 0.5)

# ### Type stability
#
# A function is called type-stable if Julia is able to infer what the output
# type will be based purely on the types of the inputs.
#
# Things that thwart type stability:
# * Running things in global scope: create functions instead!
# * Non-concretely typed containers
# * Structs with abstractly-typed fields
# * Non-constant globals (they might change!)
# * Functions that change what they return based on the _values_ (see the sketch below):
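# A minimal sketch of that last point (the function names are illustrative):

unstable(x) = x > 0 ? x : 0       # a Float64 input may return an Int `0`!
stable(x)   = x > 0 ? x : zero(x) # always returns the same type as `x`

@code_warntype unstable(0.5) # note the `Union{Float64, Int64}` return type
@code_warntype stable(0.5)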

#-

# #### More on macros
#
# Each and every macro can define its own syntax. The `@benchmark` macro uses `$` in a special way.
# The goal behind `@benchmark` is to evaluate the performance of a code snippet
# as though it were written in a function. Use `$` to flag what will be an argument
# or local variable in the function. Forgetting to use `$`s may result in faster
# or slower timings than real-world performance.

x = 0.5 # non-constant global
@btime sin(x)
@btime sin($x)

#-

@btime sin(0.5) # constant literal!
@btime sin($0.5)

# ### Specializations
#
# Julia's reasoning about types is particularly important since it generates
# specialized machine code specifically for the given arguments.

@code_llvm 1 + 2

# This applies just the same to any functions we write — even the more complicated ones:

@code_llvm findclosest2(Float32[2.2,3.4,4.5],Float32(3.2))

# And every distinct combination of argument types gets its own specialization — here's the same function with an `Int` point:

remove_comments(s) = join(filter(x->!startswith(x, ";"), split(s, "\n")), "\n")
sprint(code_llvm, findclosest2, Tuple{Vector{Float32}, Int}) |> remove_comments |> print

# ## Modern hardware effects
#
# There are lots of little performance quirks in modern computers; I'll just
# cover two interesting ones here:

@benchmark findclosest2($data, $0.5)

#-

sorteddata = sort(data)
@benchmark findclosest2($sorteddata, $0.5)

# Unfortunately, this isn't demonstrable on a hardened cloud platform... because
# it's a huge security risk!
#
# * https://meltdownattack.com
# * https://discourse.julialang.org/t/psa-microbenchmarks-remember-branch-history/17436

idxs = sortperm(data)
sortedview = @view data[idxs]
@benchmark findclosest2($sortedview, $0.5)

# ### Memory latencies
#
# | System Event                   | Actual Latency | Scaled Latency |
# | ------------------------------ | -------------- | -------------- |
# | One CPU cycle                  | 0.4 ns         | 1 s            |
# | Level 1 cache access           | 0.9 ns         | 2 s            |
# | Level 2 cache access           | 2.8 ns         | 7 s            |
# | Level 3 cache access           | 28 ns          | 1 min          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          |
# | Intel Optane memory access     | <10 μs         | 7 hrs          |
# | NVMe SSD I/O                   | ~25 μs         | 17 hrs         |
# | SSD I/O                        | 50–150 μs      | 1.5–4 days     |
# | Rotational disk I/O            | 1–10 ms        | 1–9 months     |
# | Internet call: SF to NYC       | 65 ms          | 5 years        |
# | Internet call: SF to Hong Kong | 141 ms         | 11 years       |
#
# (from https://www.prowesscorp.com/computer-latency-at-a-human-scale/)

#-

# # Key Takeaways
#
# * Measure, measure, measure!
# * Get familiar with the [Performance Tips](https://docs.julialang.org/en/v1/manual/performance-tips/)
# * Don't be scared of `@code_typed`/`@code_warntype` and `@code_llvm`
--------------------------------------------------------------------------------
/050 Parallel Algorithms.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Parallel Algorithms: Thinking in Parallel
#
# Now that we're starting to see the challenges of parallelism, it's worth taking
# a step back and examining how we might go about designing parallel algorithms.
#
# This is adapted from a [workshop paper](http://jiahao.github.io/parallel-prefix/) by Jiahao Chen and
# Alan Edelman entitled "Parallel Prefix Polymorphism Permits Parallelization, Presentation & Proof" that
# appeared in the proceedings of the [First Workshop for High Performance Technical Computing in Dynamic
# Languages](http://jiahao.github.io/hptcdl-sc14/), held in conjunction with [SC14: The International Conference on High Performance Computing, Networking, Storage and Analysis](http://sc14.supercomputing.org/).

#-

using Compose, Gadfly

# # `reduce()`
#
# Reduction applies a binary operator to a vector repeatedly to return a scalar. Thus `+` becomes `sum`, and `*` becomes `prod`.
#
# It is considered a basic parallel computing primitive.

reduce(+, 1:8) == sum(1:8)  # triangular numbers

#-

reduce(*, 1:8) == prod(1:8) # factorials
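# The reason reductions parallelize well is associativity: we can split the
# work, reduce each piece independently, and then combine the partial results.
# A two-way split as a sketch (a real implementation splits across cores):

left  = reduce(+, 1:4) # one worker could compute this half...
right = reduce(+, 5:8) # ...while another computes this half
left + right == reduce(+, 1:8)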
# You can also use reduce to compute Fibonacci numbers using their recurrences.

M = [1 1; 1 0]
reduce(*, fill(M, 3))
prod(fill(M, 3))

#-

n = 40 # Try changing n to pick different values (try between 0-100)
@show prod(fill(big.(M), n))

# # `prefix` or `scan`
#
# Having discussed `reduce`, we are now ready for the idea behind prefix sum.
# Prefix or scan or accumulate has long been considered an important parallel
# primitive as well.
#
# Suppose you wanted to compute the partial sums of a vector, i.e. given
# `y[1:n]`, we want to overwrite the vector `y` with the vector of partial sums
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = y[1] + y[2]
# new_y[3] = y[1] + y[2] + y[3]
# ...
# ```
#
# At first blush, it seems impossible to parallelize this, since
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = new_y[1] + y[2]
# new_y[3] = new_y[2] + y[3]
# ...
# ```
#
# which appears to be an intrinsically serial process. As written with a `+`
# operator, this is `cumsum` — but note that it can generalize to any operation.

function prefix_serial!(⊕, y)
    for i=2:length(y)
        y[i] = y[i-1] ⊕ y[i]
    end
    y
end

#-

@show prefix_serial!(+, [1:8;])
@show cumsum(1:8)

#-

@show prefix_serial!(*, [1:8;])
@show cumprod(1:8)

#-

@show accumulate(*, [1:8;])

# However, it turns out that because these operations are associative, we can regroup the _order_ of how these sums or products are carried out. (This of course extends to other associative operations, too.) Another ordering of 8 associative operations is provided by `prefix8!`:

## Magic :)
function prefix8!(⊕, y)
    length(y)==8 || error("length 8 only")
    for i in (2,4,6,8); y[i] = y[i-1] ⊕ y[i]; end
    for i in (  4,  8); y[i] = y[i-2] ⊕ y[i]; end
    for i in (      8); y[i] = y[i-4] ⊕ y[i]; end
    for i in (    6  ); y[i] = y[i-2] ⊕ y[i]; end
    for i in ( 3,5,7 ); y[i] = y[i-1] ⊕ y[i]; end
    y
end

#-

prefix8!(+, [1:8;]) == cumsum(1:8)

# In fact, this can generalize beyond just length-8 arrays:

## More magic
function prefix!(⊕, y)
    l=length(y)
    k=ceil(Int, log2(l))
    @inbounds for j=1:k, i=2^j:2^j:min(l, 2^k) # "reduce"
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    @inbounds for j=(k-1):-1:1, i=3*2^(j-1):2^j:min(l, 2^k) # "expand"
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    y
end

#-

A = rand(0:9, 123)
prefix!(*, copy(A)) == cumprod(A)

# ## What is this magic?

#-

# We can visualize the operations with a little bit of trickery. In Julia, arrays are simply types that expose the array protocol. In particular, they need to implement methods for the generic functions `length`, `getindex` and `setindex!`. The last two are used in indexing operations, since statements
#
#     y = A[1]
#     A[3] = y
#
# get desugared to
#
#     y = getindex(A, 1)
#     setindex!(A, y, 3)
#
# respectively.
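# (A quick sanity check of that desugaring:)

A = [10, 20, 30]
getindex(A, 1) == A[1] # true: `A[1]` *is* a `getindex` call
setindex!(A, 99, 3); A # A is now [10, 20, 99]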
#
# We can trace through the iterable by introducing a dummy array type, `AccessArray`, which records every access to `getindex` and `setindex!`.
#
# Specifically:
#
# - `length(A::AccessArray)` returns the length of the array it wraps
# - `getindex(A::AccessArray, i)` records read access to the index `i` in the `A.read` field and then actually returns the value in the array it wraps.
# - `setindex!(A::AccessArray, x, i)` records write access to the index `i`. The `A.history` field is appended with a new tuple consisting of the current `A.read` field and the index `i`, and then it performs the assignment.
#
# The way `AccessArray` works, it assumes an association between a single `setindex!` call and all the preceding `getindex` calls since the previous `setindex!` call, which is sufficient for the purposes of tracing through prefix calls.

mutable struct AccessArray{T,N,A}
    data :: A
    read :: Vector{Int}
    history :: Vector{Tuple{Vector{Int},Int}}
end
AccessArray(A) = AccessArray{eltype(A), ndims(A), typeof(A)}(A, Int[], Int[])

Base.length(A::AccessArray) = length(A.data)

function Base.getindex(A::AccessArray, i::Int)
    push!(A.read, i)
    A.data[i]
end

function Base.setindex!(A::AccessArray, x, i::Int)
    push!(A.history, (A.read, i))
    A.read = Int[]
    A.data[i] = x
end

#-

M = AccessArray(rand(8))

#-

M[7] = M[3] + M[2]

#-

M.history

# So now we can trace the access pattern when calling `prefix8!`:

A = prefix8!(+, AccessArray(rand(8)))

#-

A.history

# Now let's visualize this! Each entry in `A.history` is rendered by a gate object:

using Compose: circle, mm

#-

struct Gate{I,O}
    ins :: I
    outs :: O
end

import Gadfly.render

function render(G::Gate, x₁, y₁, y₀; rᵢ=0.1, rₒ=0.25)
    ipoints = [(i, y₀+rᵢ) for i in G.ins]
    opoints = [(i, y₀+0.5) for i in G.outs]
    igates = [circle(i..., rᵢ) for i in ipoints]
    ogates = [circle(i..., rₒ) for i in opoints]
    lines = [line([i, j]) for i in ipoints, j in opoints]
    compose(context(units=UnitBox(0.5,0,x₁,y₁+1)),
        compose(context(), stroke(colorant"black"), fill(colorant"white"),
            igates..., ogates...),
        compose(context(), linewidth(0.3mm), stroke(colorant"black"),
            lines...))
end

A = Gate([1,2], 2)
render(A, 2, 0, 0)

# Now we render the whole algorithm. We have to scan through the trace twice; the first time merely calculates the maximum depth that needs to be drawn and the second time actually generates the objects.

function render(A::AccessArray)
    # Scan to find maximum depth
    olast = depth = 0
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        olast = maximum(y[2])
    end
    maxdepth = depth

    olast = depth = 0
    C = []
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        push!(C, render(Gate(y...), length(A), maxdepth, depth))
        olast = maximum(y[2])
    end

    push!(C, compose(context(units=UnitBox(0.5,0,length(A),1)),
        [line([(i,0), (i,1)]) for i=1:length(A)]...,
        linewidth(0.1mm), stroke(colorant"grey")))
    compose(context(), C...)
end

#-

render(prefix!(+, AccessArray(zeros(8))))

# Now we can see that `prefix!` rearranges the operations to form two spanning trees —
# try changing the number of elements!

render(prefix!(+, AccessArray(zeros(16))))

# as contrasted with the serial code:

render(prefix_serial!(+, AccessArray(zeros(8))))

# # Now exploit the parallelism in the _algorithm_ to use a parallel _implementation_

using .Threads
function prefix_threads!(⊕, y)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @threads for i=2^j:2^j:min(l, 2^k) # "reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @threads for i=3*2^(j-1):2^j:min(l, 2^k) # "expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end

A = rand(500_000);

using BenchmarkTools
@btime prefix_serial!(+, $(copy(A)));
@btime prefix!(+, $(copy(A)));
@btime prefix_threads!(+, $(copy(A)));

prefix_threads!(+, copy(A)) == prefix!(+, copy(A)) ≈ cumsum(A)

# # Thinking in parallel
#
# Notice how we didn't need to contort ourselves in making our algorithm
# work with `@threads`. We really did _just_ tack a `@threads` on it and it
# just worked. It was both accurate _and_ fast.
#
# Coming up with rearrangements that make your particular algorithm parallel
# friendly isn't always easy, but when possible it makes everything else
# just fall out naturally.
#
# Finally, note that there can be clever ways to visualize algorithms as sanity checks.
--------------------------------------------------------------------------------
/040 Multithreading.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Multithreading
#
# Now we're finally ready to start talking about running things on multiple
# processors! Most computers (even cell phones) these days have multiple cores
# or processors — so the obvious place to start working with parallelism is
# making use of those from within our Julia process.
#
# The first challenge, though, is knowing precisely how many "processors" you have.
# "Processors" is in scare quotes because, well, it's complicated.

versioninfo(verbose = true)

#-

using Hwloc
Hwloc.num_physical_cores()

# What your computer reports as the number of processors might not be the same
# as the total number of "cores". While sometimes virtual processors can add
# performance, parallelizing a typical numerical computation over these virtual
# processors will lead to significantly worse performance because they still
# have to share much of the nuts and bolts of the computation hardware.

#-

# Julia is somewhat multithreaded by default! BLAS calls (like matrix multiplication) are
# already threaded:

using BenchmarkTools
A = rand(1000, 1000);
B = rand(1000, 1000);
@benchmark $A*$B

# This is — by default — already using all your CPU cores!
# You can see the effect
# by changing the number of threads (which BLAS supports doing dynamically):

using LinearAlgebra
BLAS.set_num_threads(1)
@benchmark $A*$B
BLAS.set_num_threads(4)
@benchmark $A*$B

# ## What does it look like to implement your _own_ threaded algorithm?

using .Threads

nthreads()

# Julia currently needs to start up knowing that it has threading support enabled.
#
# You do that with an environment variable. To get four threads, start Julia with:
#
# ```
# JULIA_NUM_THREADS=4 julia
# ```

run(`env JULIA_NUM_THREADS=4 julia -E 'using .Threads; nthreads()'`)

# The other way to do it is in JuliaPro itself:
#
# * Go to the Julia Menu -> Settings -> Number of Threads
# * By default it'll choose a "good" number for you

threadid()

# So we're currently on thread 1. Of course a serial loop like this will
# just set every element to one:

A = zeros(Int, nthreads())
for i in 1:nthreads()
    A[i] = threadid()
end
A

# But if we prefix it with `@threads` then the loop body is divided between threads!

@threads for i in 1:nthreads()
    A[i] = threadid()
end
A

# So let's try implementing our first simple threaded algorithm — `sum`:

function threaded_sum1(A)
    r = zero(eltype(A))
    @threads for i in eachindex(A)
        @inbounds r += A[i]
    end
    return r
end

A = rand(10_000_000)
threaded_sum1(A)
@time threaded_sum1(A)

#-

sum(A)
@time sum(A)

# Whoa! What happened? Not only did we get the wrong answer, it was _slow_ to get it!

function threaded_sum2(A)
    r = Atomic{eltype(A)}(zero(eltype(A)))
    @threads for i in eachindex(A)
        @inbounds atomic_add!(r, A[i])
    end
    return r[]
end

threaded_sum2(A)
@time threaded_sum2(A)

# Alright! Now we got the correct answer (modulo some floating point associativity),
# but it's still slower than just doing the simple thing on 1 core.

threaded_sum2(A) ≈ sum(A)

# But it's still slow! Using atomics is much slower than just adding integers
# because we constantly have to go and check _which_ processor has the latest
# work! Also remember that each thread is running on its own processor — and
# that processor also supports SIMD! Well, that is if it didn't need to worry
# about syncing up with the other processors...

function threaded_sum3(A)
    r = Atomic{eltype(A)}(zero(eltype(A)))
    len, rem = divrem(length(A), nthreads())
    @threads for t in 1:nthreads()
        rₜ = zero(eltype(A))
        @simd for i in (1:len) .+ (t-1)*len
            @inbounds rₜ += A[i]
        end
        atomic_add!(r, rₜ)
    end
    # catch up any stragglers
    result = r[]
    @simd for i in length(A)-rem+1:length(A)
        @inbounds result += A[i]
    end
    return result
end

threaded_sum3(A)
@time threaded_sum3(A)

# Dang, that's complicated. There's also a problem: `Atomic`s only support
# primitive types like `Int` and `Float64`, so this fails outright on an
# array of complex numbers!

threaded_sum3(rand(10) .+ rand(10)im) # try an array of complex numbers!

# Isn't there an easier way?

R = zeros(eltype(A), nthreads())

#-

function threaded_sum4(A)
    R = zeros(eltype(A), nthreads())
    @threads for i in eachindex(A)
        @inbounds R[threadid()] += A[i]
    end
    r = zero(eltype(A))
    # sum the partial results from each thread
    for i in eachindex(R)
        @inbounds r += R[i]
    end
    return r
end

threaded_sum4(A)
@time threaded_sum4(A)

# This sacrifices our ability to `@simd` so it's a little slower, but at least we don't need to worry
# about all those indices! And we also don't need to worry about atomics and
# can again support arrays of any elements:

threaded_sum4(rand(10) .+ rand(10)im)

# ## Key takeaways from `threaded_sum`:
#
# * Beware shared state across threads — it may lead to wrong answers!
#     * Protect yourself by using atomics (or [locks/mutexes](https://docs.julialang.org/en/v1/base/multi-threading/#Synchronization-Primitives-1); see the sketch below)
#     * Better yet: divide up the work manually such that the inner loops don't
#       share state. `@threads for i in 1:nthreads()` is a handy idiom.
#     * Alternatively, just use an array and only access a single thread's elements
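# Since we didn't demonstrate the lock-based option above, here's a minimal
# sketch (the function name is illustrative): each thread accumulates into its
# own local variable, and a `ReentrantLock` guards the one shared update.

function threaded_sum_lock(A)
    r = Ref(zero(eltype(A)))
    lk = ReentrantLock()
    @threads for t in 1:nthreads()
        rₜ = zero(eltype(A))
        for i in t:nthreads():length(A) # a strided partition of the indices
            @inbounds rₜ += A[i]
        end
        lock(lk) do
            r[] += rₜ # only one thread at a time may run this
        end
    end
    return r[]
end

threaded_sum_lock(rand(10) .+ rand(10)im) # any element type works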

#-

# # Beware of global state (even if it's not obvious!)
#
# Another class of algorithm that you may want to parallelize is a monte-carlo
# problem. Since each iteration is a new random draw, and since you're interested
# in looking at the aggregate result, this seems like it should lend itself to
# parallelism quite nicely!

using BenchmarkTools

#-

function serialpi(n)
    inside = 0
    for i in 1:n
        x, y = rand(), rand()
        inside += (x^2 + y^2 <= 1)
    end
    return 4 * inside / n
end
serialpi(1)
@time serialpi(100_000_000)

# Let's use the techniques we learned above to make a fast threaded implementation:

function threadedpi(n)
    inside = zeros(Int, nthreads())
    @threads for i in 1:n
        x, y = rand(), rand()
        @inbounds inside[threadid()] += (x^2 + y^2 <= 1)
    end
    return 4 * sum(inside) / n
end
threadedpi(240)
@time threadedpi(100_000_000)

# Ok, now why didn't that work? It's slow! Let's look at the sequence of random
# numbers that we generate:

import Random
Random.seed!(0)
N = 20000
Rserial = zeros(N)
for i in 1:N
    Rserial[i] = rand()
end
Rserial

#-

Random.seed!(0)
Rthreaded = zeros(N)
@threads for i in 1:N
    Rthreaded[i] = rand()
end
Rthreaded

#-

Set(Rserial) == Set(Rthreaded)

# Aha, `rand()` isn't (currently) threadsafe! It's mutating (and reading) some global each
# time to figure out what to get next. This leads to slowdowns — and worse — it
# skews the generated distribution of random numbers since some are repeated!!
#
# Note: in the upcoming Julia 1.3 it will be threadsafe by default! Here's how
# we can emulate it on prior versions:

const ThreadRNG = Vector{Random.MersenneTwister}(undef, nthreads())
@threads for i in 1:nthreads()
    ThreadRNG[Threads.threadid()] = Random.MersenneTwister()
end
function threadedpi2(n)
    inside = zeros(Int, nthreads())
    len, rem = divrem(n, nthreads())
    rem == 0 || error("use a multiple of $(nthreads()), please!")
    @threads for i in 1:nthreads()
        rng = ThreadRNG[threadid()]
        v = 0
        for j in 1:len
            x, y = rand(rng), rand(rng)
            v += (x^2 + y^2 <= 1)
        end
        inside[threadid()] = v
    end
    return 4 * sum(inside) / n
end
threadedpi2(240)
@time threadedpi2(100_000_000)

# As an aside, be careful about initializing many `MersenneTwister`s with
# different states. Better to use [`randjump`](https://docs.julialang.org/en/v1/manual/parallel-computing/#Side-effects-and-mutable-function-arguments-1) to skip ahead for a single state.

#-

# # Beware oversubscription
#
# Remember how BLAS is threaded by default? What happens if we try to `@threads`
# something that uses BLAS?

Ms = [rand(1000, 1000) for _ in 1:100]
function serial_matmul(As)
    first_idxs = zeros(length(As))
    for i in eachindex(As)
        @inbounds first_idxs[i] = (As[i]'*As[i])[1]
    end
    first_idxs
end
serial_matmul(Ms[1:1]);
@time serial_matmul(Ms);

#-

using LinearAlgebra
BLAS.set_num_threads(nthreads()) # Explicitly tell BLAS to use the same number of threads
function threaded_matmul(As)
    first_idxs = zeros(length(As))
    @threads for i in eachindex(As)
        @inbounds first_idxs[i] = (As[i]'*As[i])[1]
    end
    first_idxs
end
threaded_matmul(Ms[1:1])
@time threaded_matmul(Ms);

#-

BLAS.set_num_threads(1)
@time threaded_matmul(Ms);

#-

@time serial_matmul(Ms) # Again, now that BLAS has just 1 thread

# # Beware "false sharing"

#-

# Remember the memory latency table?
#
#
# | System Event                   | Actual Latency | Scaled Latency |                          |
# | ------------------------------ | -------------- | -------------- | ------------------------ |
# | One CPU cycle                  | 0.4 ns         | 1 s            | ← work happens here      |
# | Level 1 cache access           | 0.9 ns         | 2 s            |                          |
# | Level 2 cache access           | 2.8 ns         | 7 s            |                          |
# | Level 3 cache access           | 28 ns          | 1 min          |                          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          | ← we have control here   |
#
# This is what a typical modern cpu looks like:
#
# ![Intel Core i7](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/i7.jpg)
#
# Multiple cores on the same processor share the L3 cache, but do not share L1 and L2 caches! So what happens if we're accessing and mutating data from the same array across multiple cores?
#
# ![Cache coherency](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/false-sharing.gif)
#
# Unlike "true" sharing — which we saw above — false sharing will still return the correct answer! But it does so at the cost of performance.
# The cores recognize they don't have exclusive access to the cache line and so upon modification they alert all other cores to invalidate and re-fetch the data.

function test(spacing)
    a = zeros(Threads.nthreads()*spacing)
    b = rand(1000000)
    calls = zeros(Threads.nthreads()*spacing)
    @threads for i in eachindex(b)
        a[Threads.threadid()*spacing] += b[i]
        calls[Threads.threadid()*spacing] += 1
    end
    a, calls
end
@benchmark test(1);
@benchmark test(8);

#-

# ## Further improvements coming here!
#
# PARTR — the threading improvement I discussed at the beginning — aims to address
# this problem of having library functions implemented with `@threads` and then
# having callers call them with `@threads`. It uses a state-of-the-art work queue
# mechanism to make sure that all threads stay busy.

#-

# # Threading takeaways:
#
# * It's easy! Just start Julia with `JULIA_NUM_THREADS` and tack a `@threads` on your loop
# * Well, not so fast
#     * Be aware of your hardware to set `JULIA_NUM_THREADS` appropriately
#     * Beware shared state (for both performance and correctness)
#     * Beware global state (but the built-in global state is improving!)
#     * Beware false sharing (especially with multiple processor chips)
# * We need to think carefully about how to design parallel algorithms!
--------------------------------------------------------------------------------
/070 Distributed.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Distributed (or multi-core or multi-process) parallelism
#
# Julia has a built-in standard library — Distributed — that allows you to
# start and run multiple concurrent Julia processes. Imagine starting a slew
# of Julia instances and then having an easy way to run code on each and every
# one of them; that's what Distributed provides.
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/Julia6x.png)

using Distributed
nprocs()

import Hwloc
n = Hwloc.num_physical_cores()

#-

addprocs(n, exeflags=`--project=$@__DIR__`)
nprocs()

#-

myid()

# Now we can easily communicate with the other nodes:

r = @spawnat 2 (myid(), rand())

#-

fetch(r)

# This works kinda like an `@async` task!

@time r = @spawnat 2 (sleep(1), rand())
@time fetch(r)

# So we can repeat the same examples from tasks:

@time for w in workers()
    @spawnat w sleep(1)
end

#-

@time @sync for w in workers()
    @spawnat w sleep(1)
end

# Except unlike tasks, we're executing the code on a separate process — which
# can be performed on a different processor in parallel!

@everywhere function work(N)
    series = 1.0
    for i in 1:N
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end

#-

@time work(1_000_000_000)
@time @sync for i in workers()
    @spawnat i work(1_000_000_000)
end

# Of course, this isn't very helpful. We're just performing exactly the same
# calculation on every worker... and then completely ignoring the result!
# Let's
# restructure our computation to be a bit more parallel friendly:

@everywhere function partial_pi(r)
    series = 0.0
    for i in r
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end
a = partial_pi(0:999)
a, a-pi

#-

b = partial_pi(1000:9999)
(a + b), (a+b) - pi

# So now we can distribute this computation across our many workers!

r = 0:1_000_000_000
futures = Array{Future}(undef, nworkers())
@time begin
    for (i, id) in enumerate(workers())
        batch = 0:length(r)÷nworkers()-1
        futures[i] = @spawnat id partial_pi(batch .+ (i-1)*(length(r)÷nworkers()))
    end
    p = sum(fetch.(futures))
end
p - pi

# But that's rather annoying — needing to carefully divide up our workflow and
# manually collect all our results and such. There's an easier way:

@time p = @distributed (+) for r in [(0:9999) .+ offset for offset in 0:10000:r[end]-1]
    partial_pi(r)
end
p - pi

# Why is this different from `@threads for` and `@simd for`? Why not just
# `@distributed for`? Why the `@distributed (+) for`?

#-

# ## Data movement

#-

# Remember: Moving data is _expensive_!
#
# | System Event                   | Actual Latency | Scaled Latency |
# | ------------------------------ | -------------- | -------------- |
# | One CPU cycle                  | 0.4 ns         | 1 s            |
# | Level 1 cache access           | 0.9 ns         | 2 s            |
# | Level 2 cache access           | 2.8 ns         | 7 s            |
# | Level 3 cache access           | 28 ns          | 1 min          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          |
# | Intel Optane memory access     | <10 μs         | 7 hrs          |
# | NVMe SSD I/O                   | ~25 μs         | 17 hrs         |
# | SSD I/O                        | 50–150 μs      | 1.5–4 days     |
# | Rotational disk I/O            | 1–10 ms        | 1–9 months     |
# | Internet call: SF to NYC       | 65 ms          | 5 years        |
# | Internet call: SF to Hong Kong | 141 ms         | 11 years       |
#
# You really don't want to be taking a trip to the moon very frequently.
# Communication between processes can indeed be as expensive as hitting a disk —
# sometimes they're even implemented that way.
#
# So that's why Julia has special support for reductions built in to the
# `@distributed` macro: each worker can do its own (intermediate) reduction
# before returning just one value to our master node.

#-

# But sometimes you need to see those intermediate values. If you have a
# very expensive computation relative to the communication overhead, there are
# several ways to do this. The easiest is `pmap`:

@time pmap(partial_pi, [(0:99999) .+ offset for offset in 0:100000:r[end]-1])

# When we have a large computation relative to the number of return values,
# `pmap` is great and easy.
#
# Increase the work on each worker by 100x and reduce the amount of communication by 100x:

@time pmap(partial_pi, [(0:9999999) .+ offset for offset in 0:10000000:r[end]-1])

# There are other ways of doing this, though, too — we'll get to them in a minute.
# But first, there's something else that I glossed over: the `@everywhere`s above.

#-

# ## Code movement

#-

# Each node is _completely_ independent; it's like starting brand new, separate
# Julia processes yourself. By default, `addprocs()` just launches the
# appropriate number of workers for the current workstation that you're on, but
# you can easily connect them to remote machines via SSH or even through cluster
# managers.
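# For instance, a sketch of what launching workers over SSH looks like (the
# hostnames here are placeholders; it requires passwordless SSH and Julia
# installed on each machine):
#
#     addprocs([("user@host1", 2), ("user@host2", 2)]) # two workers per host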
# By default, `addprocs()` just launches the appropriate number of workers for
# the machine that you're on, but you can easily connect them to remote
# machines via SSH or even through cluster managers.

#-

# Those `@everywhere`s above are very important! They run the given expression
# on all workers to make sure the state between them is consistent. Without
# them, you'll see errors like this:

hello() = "hello world"
r = @spawnat 2 hello()

fetch(r)

# Note that this applies to packages, too!

using Statistics # The Statistics stdlib defines mean
fetch(@spawnat 2 mean(rand(100_000)))

#-

@everywhere using Statistics
fetch(@spawnat 2 mean(rand(100_000)))

# # Other ways to structure and/or share data between processes
#
# Unlike `@threads`, we no longer have access to the same memory. While this
# does make expressing some algorithms a little more tricky, the "default"
# is much safer! There isn't any shared state to begin with, so it's harder
# to write an incorrect algorithm. It's also just harder to write some
# algorithms in the first place.
#
# So there are some special array types that can help bridge the gap between
# processes and make writing parallel code a bit easier.

#-

# ## The `SharedArray`
#
# If all workers are on the same physical machine, then — while they cannot
# directly see each other's memory — they can ask the operating system to map
# the very same region of memory into each of their address spaces.
#
# The `SharedArray` makes use of this fact, allowing concurrent access to the
# same array — somewhat akin to the threads' default state.
#
# This is the prefix definition from the "thinking in parallel" course:
#
# ```
# using .Threads
# function prefix_threads!(y, ⊕)
#     l=length(y)
#     k=ceil(Int, log2(l))
#     for j=1:k
#         @threads for i=2^j:2^j:min(l, 2^k) #"reduce"
#             @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
#         end
#     end
#     for j=(k-1):-1:1
#         @threads for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
#             @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
#         end
#     end
#     y
# end
# ```

using SharedArrays
function prefix!(⊕, y::SharedArray)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @distributed for i=2^j:2^j:min(l, 2^k) #"reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @distributed for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end
data = rand(1_000_000);
A = SharedArray(data);

#-

prefix!(+, copy(A)) # compile
@time prefix!(+, A);

#-

A ≈ cumsum(data)

# What went wrong?
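# Without a reducer, `@distributed` only _launches_ its loop — it returns
# immediately, without waiting for the workers to finish! So each phase of the
# algorithm started racing ahead before the previous phase was done. A quick
# way to see that behavior:

@time @distributed for i in 1:nworkers()
    sleep(1)
end # "finishes" almost instantly — the work is still running in the background

# Just like with the bare `@spawnat` loops earlier, the fix is to wait with `@sync`:

#-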
function prefix!(⊕, y::SharedArray)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @sync @distributed for i=2^j:2^j:min(l, 2^k) #"reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @sync @distributed for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end
A = SharedArray(data)
@time prefix!(+, A)

#-

A ≈ cumsum(data)

# ## DistributedArrays
#
# We can, though, turn the problem on its head and allow the _data_ itself
# to determine how the problem gets split up. This can save us tons of indexing
# headaches.

@everywhere using Distributed
using DistributedArrays
@everywhere using DistributedArrays
A = DArray(I->fill(myid(), length.(I)), (24, 24))

# The first argument is a function that maps a given set of indices `I` to the
# _local portion_ of the distributed array covering those indices.

A = DArray((24,24)) do I
    @show I
    fill(myid(), length.(I))
end

# Notice that none of the array actually lives on processor 1, but we can still
# display the contents — when we do, we're requesting that all the workers give
# us their current data! And while we've only talked about master-worker
# communication so far, workers can communicate directly amongst themselves,
# too (by default).

using BenchmarkTools
@everywhere using BenchmarkTools
fetch(@spawnat 2 @benchmark $A[1,1])

#-

fetch(@spawnat 2 @benchmark $A[end,end])

# So it's fastest to work on a `DArray`'s "local" portion, but it's _possible_
# to grab other data if need be. This is perfect for any sort of tiled operation
# that works on neighboring values (like image filtering/convolution). Or Conway's
# game of life!

function life_step(d::DArray)
    DArray(size(d),procs(d)) do I
        # Compute the indices of the outside edge (that will come from other processors)
        top   = mod1(first(I[1])-1,size(d,1))
        bot   = mod1( last(I[1])+1,size(d,1))
        left  = mod1(first(I[2])-1,size(d,2))
        right = mod1( last(I[2])+1,size(d,2))
        # Create a new, temporary array that holds the local part + outside edge
        old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2)
        # These accesses will pull data from other processors
        old[1      , 1      ] = d[top , left]
        old[2:end-1, 1      ] = d[I[1], left]  # left side (and corners)
        old[end    , 1      ] = d[bot , left]
        old[1      , end    ] = d[top , right]
        old[2:end-1, end    ] = d[I[1], right] # right side (and corners)
        old[end    , end    ] = d[bot , right]
        old[1      , 2:end-1] = d[top , I[2]]  # top
        old[end    , 2:end-1] = d[bot , I[2]]  # bottom
        # But this big one is all local!
        old[2:end-1, 2:end-1] = d[I[1], I[2]]  # middle

        life_rule(old) # Compute the new segment!
    end
end
@everywhere function life_rule(old)
    # Now this part — the computational part — is entirely local and on Arrays!
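    # `old` carries a one-cell halo of neighbors around the local block, so the
    # result is two smaller in each dimension; a cell lives on with exactly 3
    # live neighbors, or with 2 if it was already alive.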
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        @inbounds for i = 2:m-1
            nc = (+)(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                     old[i  ,j-1],             old[i  ,j+1],
                     old[i+1,j-1], old[i+1,j], old[i+1,j+1])
            new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j])
        end
    end
    new
end

#-

A = DArray(I->rand(Bool, length.(I)), (20,20))
@everywhere using Colors
Gray.(A)

#-

B = copy(A)

#-

B = life_step(B) # re-run this cell to step the simulation forward again
Gray.(B)

#-

# ## Clusters and more ways to distribute
#
# You can easily connect to completely separate machines with SSH access built in!
# But there are many other ways to connect to clusters:
#
# * [JuliaRun](https://juliacomputing.com/products/juliarun)
# * [Kubernetes](https://juliacomputing.com/blog/2018/12/15/kuber.html)
# * [MPI](https://github.com/JuliaParallel/MPI.jl)
# * [Cluster job queues with ClusterManagers](https://github.com/JuliaParallel/ClusterManagers.jl)
# * [Hadoop](https://github.com/JuliaParallel/Elly.jl)
# * [Spark](https://github.com/dfdx/Spark.jl)

#-

# # Multi-process parallelism is the heavy-duty workhorse in Julia
#
# It can tackle very large problems and distribute across a very large number
# of workers. Key things to remember:
#
# * Each worker is a completely independent Julia process
#   * Data must move to them
#   * Code must move to them
# * Structure your algorithms and use a distributed mechanism that fits with the
#   time and memory parameters of your problem
#   * `@distributed` can be good for reductions and even relatively fast inner
#     loops with limited (or no) explicit data transfer
#   * `pmap` is great for very expensive inner loops that return a value (the
#     closing sketch below contrasts the two)
#   * `SharedArray`s can be an easier drop-in replacement for threading-like
#     behaviors (on a single machine)
#   * `DistributedArray`s can turn the problem on its head and let the data do
#     the work splitting!
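
#-

# As that closing sketch, here are the first two mechanisms side by side on
# the same chunks, re-using `partial_pi` and the workers from above.
# `@distributed (+)` reduces worker-side and ships a single number back from
# each worker, while `pmap` ships one result back per chunk.

chunks = [(0:9_999_999) .+ offset for offset in 0:10_000_000:999_999_999]
p1 = @distributed (+) for c in chunks
    partial_pi(c)
end
p2 = sum(pmap(partial_pi, chunks))
p1 - pi, p2 - pi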
408 | -------------------------------------------------------------------------------- /Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[AbstractFFTs]] 4 | deps = ["LinearAlgebra"] 5 | git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" 6 | uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" 7 | version = "0.4.1" 8 | 9 | [[AbstractTrees]] 10 | deps = ["Markdown", "Test"] 11 | git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" 12 | uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" 13 | version = "0.2.1" 14 | 15 | [[Adapt]] 16 | deps = ["LinearAlgebra", "Test"] 17 | git-tree-sha1 = "53d8fec4f662088c1202530e338a11a919407f3b" 18 | uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 19 | version = "0.4.2" 20 | 21 | [[Arpack]] 22 | deps = ["BinaryProvider", "Libdl", "LinearAlgebra"] 23 | git-tree-sha1 = "07a2c077bdd4b6d23a40342a8a108e2ee5e58ab6" 24 | uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" 25 | version = "0.3.1" 26 | 27 | [[AxisAlgorithms]] 28 | deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"] 29 | git-tree-sha1 = "a4d07a1c313392a77042855df46c5f534076fab9" 30 | uuid = "13072b0f-2c55-5437-9ae7-d433b7a33950" 31 | version = "1.0.0" 32 | 33 | [[Base64]] 34 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 35 | 36 | [[BenchmarkTools]] 37 | deps = ["JSON", "Printf", "Statistics", "Test"] 38 | git-tree-sha1 = "5d1dd8577643ba9014574cd40d9c028cd5e4b85a" 39 | uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 40 | version = "0.4.2" 41 | 42 | [[BinDeps]] 43 | deps = ["Compat", "Libdl", "SHA", "URIParser"] 44 | git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" 45 | uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" 46 | version = "0.8.10" 47 | 48 | [[BinaryProvider]] 49 | deps = ["Compat", "CredentialsHandler", "Libdl", "Pkg", "SHA", "TOML", "Test"] 50 | uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" 51 | 52 | [[CSTParser]] 53 | deps = ["Tokenize"] 54 | git-tree-sha1 = "376a39f1862000442011390f1edf5e7f4dcc7142" 55 | uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" 56 | version = "0.6.0" 57 | 58 | [[CUDAapi]] 59 | deps = ["Libdl", "Logging", "Test"] 60 | git-tree-sha1 = "125122309a4387e0d18787cef0f03800fa57702a" 61 | uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 62 | version = "0.6.3" 63 | 64 | [[CUDAdrv]] 65 | deps = ["CUDAapi", "Libdl", "Printf"] 66 | git-tree-sha1 = "9b2d99981b984378799ec70dd005cb7e7b4e914c" 67 | uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 68 | version = "3.0.1" 69 | 70 | [[CUDAnative]] 71 | deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] 72 | git-tree-sha1 = "36cbb94f74cd3e5db774134a68dc5d033ae2c87e" 73 | uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 74 | version = "2.2.1" 75 | 76 | [[Calculus]] 77 | deps = ["Compat"] 78 | git-tree-sha1 = "f60954495a7afcee4136f78d1d60350abd37a409" 79 | uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" 80 | version = "0.4.1" 81 | 82 | [[CategoricalArrays]] 83 | deps = ["Compat", "Future", "JSON", "Missings", "Printf", "Reexport"] 84 | git-tree-sha1 = "26601961df6afacdd16d67c1eec6cfe75e5ae9ab" 85 | uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" 86 | version = "0.5.4" 87 | 88 | [[CodecZlib]] 89 | deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] 90 | git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b" 91 | uuid = "944b1d66-785c-5afd-91f1-9de20f533193" 92 | version = "0.5.2" 93 | 94 | [[ColorTypes]] 95 | deps 
= ["FixedPointNumbers", "Random"] 96 | git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" 97 | uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" 98 | version = "0.8.0" 99 | 100 | [[Colors]] 101 | deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] 102 | git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" 103 | uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" 104 | version = "0.9.5" 105 | 106 | [[CommonSubexpressions]] 107 | deps = ["Test"] 108 | git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" 109 | uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" 110 | version = "0.2.0" 111 | 112 | [[Compat]] 113 | deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] 114 | git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" 115 | uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" 116 | version = "2.1.0" 117 | 118 | [[Compose]] 119 | deps = ["Base64", "Colors", "DataStructures", "Dates", "IterTools", "JSON", "LinearAlgebra", "Measures", "Printf", "Random", "Requires", "Test", "UUIDs"] 120 | git-tree-sha1 = "7d8fe0ad6f73c40ccc4e01f426a700c5a843a1d3" 121 | uuid = "a81c6b42-2e10-5240-aca2-a61377ecd94b" 122 | version = "0.7.3" 123 | 124 | [[Conda]] 125 | deps = ["JSON", "VersionParsing"] 126 | git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" 127 | uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" 128 | version = "1.3.0" 129 | 130 | [[Contour]] 131 | deps = ["LinearAlgebra", "StaticArrays", "Test"] 132 | git-tree-sha1 = "b974e164358fea753ef853ce7bad97afec15bb80" 133 | uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" 134 | version = "0.5.1" 135 | 136 | [[CoupledFields]] 137 | deps = ["Compat", "StatsBase"] 138 | git-tree-sha1 = "d56f26542bb7af9c0ec16e098a0a33352f3c9d8e" 139 | uuid = "7ad07ef1-bdf2-5661-9d2b-286fd4296dac" 140 | version = "0.1.0" 141 | 142 | [[Crayons]] 143 | deps = ["Test"] 144 | git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" 145 | uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" 146 | version = "4.0.0" 147 | 148 | [[CredentialsHandler]] 149 | deps = ["Base64", "HTTP", "TOML"] 150 | uuid = "864e158e-919d-11e8-198e-cfe890ec4681" 151 | 152 | [[CuArrays]] 153 | deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] 154 | git-tree-sha1 = "f95cbe4fe78b1fff00691aa1d2e533542f095358" 155 | uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 156 | version = "1.0.2" 157 | 158 | [[DataStructures]] 159 | deps = ["InteractiveUtils", "OrderedCollections"] 160 | git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" 161 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 162 | version = "0.17.0" 163 | 164 | [[Dates]] 165 | deps = ["Printf"] 166 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" 167 | 168 | [[DelimitedFiles]] 169 | deps = ["Mmap"] 170 | uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" 171 | 172 | [[DiffEqDiffTools]] 173 | deps = ["LinearAlgebra", "SparseArrays", "StaticArrays"] 174 | git-tree-sha1 = "b992345a39b4d9681342ae795a8dacc100730182" 175 | uuid = "01453d9d-ee7c-5054-8395-0335cb756afa" 176 | version = "0.14.0" 177 | 178 | [[DiffResults]] 179 | deps = ["Compat", "StaticArrays"] 180 | git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" 181 | uuid = 
"163ba53b-c6d8-5494-b064-1a9d43ac40c5" 182 | version = "0.0.4" 183 | 184 | [[DiffRules]] 185 | deps = ["Random", "Test"] 186 | git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" 187 | uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" 188 | version = "0.0.10" 189 | 190 | [[Distances]] 191 | deps = ["LinearAlgebra", "Printf", "Random", "Statistics", "Test"] 192 | git-tree-sha1 = "a135c7c062023051953141da8437ed74f89d767a" 193 | uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" 194 | version = "0.8.0" 195 | 196 | [[Distributed]] 197 | deps = ["Random", "Serialization", "Sockets"] 198 | uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" 199 | 200 | [[DistributedArrays]] 201 | deps = ["Distributed", "LinearAlgebra", "Primes", "Random", "Serialization", "SparseArrays", "Statistics"] 202 | git-tree-sha1 = "9b4689b8d49b42351d518431ff642ed29cedd6d4" 203 | uuid = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94" 204 | version = "0.6.2" 205 | 206 | [[Distributions]] 207 | deps = ["LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] 208 | git-tree-sha1 = "56a158bc0abe4af5d4027af2275fde484261ca6d" 209 | uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" 210 | version = "0.19.2" 211 | 212 | [[DocStringExtensions]] 213 | deps = ["LibGit2", "Markdown", "Pkg", "Test"] 214 | git-tree-sha1 = "0513f1a8991e9d83255e0140aace0d0fc4486600" 215 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" 216 | version = "0.8.0" 217 | 218 | [[FFTW]] 219 | deps = ["AbstractFFTs", "BinaryProvider", "Compat", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] 220 | git-tree-sha1 = "29cda58afbf62f35b1a094882ad6c745a47b2eaa" 221 | uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 222 | version = "0.2.4" 223 | 224 | [[FileWatching]] 225 | uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" 226 | 227 | [[FillArrays]] 228 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Test"] 229 | git-tree-sha1 = "9ab8f76758cbabba8d7f103c51dce7f73fcf8e92" 230 | uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" 231 | version = "0.6.3" 232 | 233 | [[FixedPointNumbers]] 234 | git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" 235 | uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" 236 | version = "0.6.1" 237 | 238 | [[Flux]] 239 | deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "DelimitedFiles", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SHA", "Statistics", "StatsBase", "Tracker", "ZipFile"] 240 | git-tree-sha1 = "08212989c2856f95f90709ea5fd824bd27b34514" 241 | uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" 242 | version = "0.8.3" 243 | 244 | [[ForwardDiff]] 245 | deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] 246 | git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" 247 | uuid = "f6369f11-7733-5829-9624-2563aa707210" 248 | version = "0.10.3" 249 | 250 | [[Future]] 251 | deps = ["Random"] 252 | uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" 253 | 254 | [[GPUArrays]] 255 | deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] 256 | git-tree-sha1 = "6b556af6e42b71f5712a98f8df3d110a76bfdea9" 257 | uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 258 | version = "0.7.2" 259 | 260 | [[Gadfly]] 261 | deps = ["Base64", "CategoricalArrays", "Colors", "Compat", "Compose", "Contour", "CoupledFields", "DataStructures", "Dates", "Distributions", "DocStringExtensions", 
"Hexagons", "IndirectArrays", "IterTools", "JSON", "Juno", "KernelDensity", "LibGit2", "LinearAlgebra", "Loess", "Measures", "Printf", "Random", "Requires", "Showoff", "Statistics", "StatsBase", "Test"] 262 | git-tree-sha1 = "ee709588c71eb62ce53cecf5a594bdefd6f2a9be" 263 | uuid = "c91e804a-d5a3-530f-b6f0-dfbca275c004" 264 | version = "1.0.1" 265 | 266 | [[HTTP]] 267 | deps = ["Base64", "Dates", "Distributed", "IniFile", "JSON", "MbedTLS", "Sockets", "Test"] 268 | uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" 269 | 270 | [[Hexagons]] 271 | deps = ["Test"] 272 | git-tree-sha1 = "de4a6f9e7c4710ced6838ca906f81905f7385fd6" 273 | uuid = "a1b4810d-1bce-5fbd-ac56-80944d57a21f" 274 | version = "0.2.0" 275 | 276 | [[Hwloc]] 277 | deps = ["BinaryProvider", "Libdl"] 278 | git-tree-sha1 = "bb23d264d76b82d1da80733cbb01bad8a11ae489" 279 | uuid = "0e44f5e4-bd66-52a0-8798-143a42290a1d" 280 | version = "1.0.3" 281 | 282 | [[IndirectArrays]] 283 | deps = ["Compat", "Test"] 284 | git-tree-sha1 = "b6e249be10a3381b2c72ac82f2d13d70067cb2bd" 285 | uuid = "9b13fd28-a010-5f03-acff-a1bbcff69959" 286 | version = "0.5.0" 287 | 288 | [[IniFile]] 289 | uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" 290 | 291 | [[InteractiveUtils]] 292 | deps = ["Markdown"] 293 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 294 | 295 | [[Interpolations]] 296 | deps = ["AxisAlgorithms", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] 297 | git-tree-sha1 = "e1bac96b5ef3ea23b50e801b4a988ec21861a47f" 298 | uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" 299 | version = "0.12.2" 300 | 301 | [[IterTools]] 302 | git-tree-sha1 = "2ebe60d7343962966d1779a74a760f13217a6901" 303 | uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" 304 | version = "1.2.0" 305 | 306 | [[JSON]] 307 | deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"] 308 | git-tree-sha1 = "1f7a25b53ec67f5e9422f1f551ee216503f4a0fa" 309 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 310 | version = "0.20.0" 311 | 312 | [[Juno]] 313 | deps = ["Base64", "Logging", "Media", "Profile", "Test"] 314 | git-tree-sha1 = "4e4a8d43aa7ecec66cadaf311fbd1e5c9d7b9175" 315 | uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" 316 | version = "0.7.0" 317 | 318 | [[KernelDensity]] 319 | deps = ["Distributions", "FFTW", "Interpolations", "Optim", "StatsBase", "Test"] 320 | git-tree-sha1 = "c1048817fe5711f699abc8fabd47b1ac6ba4db04" 321 | uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" 322 | version = "0.5.1" 323 | 324 | [[LLVM]] 325 | deps = ["Libdl", "Printf", "Unicode"] 326 | git-tree-sha1 = "7fafc370730b515a6273046a53cbb548ef3e38f7" 327 | uuid = "929cbde3-209d-540e-8aea-75f648917ca0" 328 | version = "1.1.1" 329 | 330 | [[LibGit2]] 331 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" 332 | 333 | [[Libdl]] 334 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 335 | 336 | [[LineSearches]] 337 | deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf", "Test"] 338 | git-tree-sha1 = "54eb90e8dbe745d617c78dee1d6ae95c7f6f5779" 339 | uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" 340 | version = "7.0.1" 341 | 342 | [[LinearAlgebra]] 343 | deps = ["Libdl"] 344 | uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 345 | 346 | [[Loess]] 347 | deps = ["Distances", "Random", "Statistics", "Test"] 348 | git-tree-sha1 = "0ee46caf683a422b595be4dfaed6cda28f541e25" 349 | uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612" 350 | version = "0.5.0" 351 | 352 | [[Logging]] 353 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 354 | 355 | 
[[MacroTools]] 356 | deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] 357 | git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" 358 | uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" 359 | version = "0.5.1" 360 | 361 | [[Markdown]] 362 | deps = ["Base64"] 363 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 364 | 365 | [[MbedTLS]] 366 | deps = ["BinaryProvider", "Dates", "Libdl", "Pkg", "Random", "Sockets"] 367 | uuid = "739be429-bea8-5141-9913-cc70e7f3736d" 368 | 369 | [[Measures]] 370 | deps = ["Test"] 371 | git-tree-sha1 = "ddfd6d13e330beacdde2c80de27c1c671945e7d9" 372 | uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" 373 | version = "0.3.0" 374 | 375 | [[Media]] 376 | deps = ["MacroTools", "Test"] 377 | git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" 378 | uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" 379 | version = "0.5.0" 380 | 381 | [[Missings]] 382 | deps = ["SparseArrays", "Test"] 383 | git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007" 384 | uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" 385 | version = "0.4.1" 386 | 387 | [[Mmap]] 388 | uuid = "a63ad114-7e13-5084-954f-fe012c677804" 389 | 390 | [[NLSolversBase]] 391 | deps = ["Calculus", "DiffEqDiffTools", "DiffResults", "Distributed", "ForwardDiff", "LinearAlgebra", "Random", "SparseArrays", "Test"] 392 | git-tree-sha1 = "0c6f0e7f2178f78239cfb75310359eed10f2cacb" 393 | uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" 394 | version = "7.3.1" 395 | 396 | [[NNlib]] 397 | deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] 398 | git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" 399 | uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 400 | version = "0.6.0" 401 | 402 | [[NaNMath]] 403 | deps = ["Compat"] 404 | git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" 405 | uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" 406 | version = "0.3.2" 407 | 408 | [[OffsetArrays]] 409 | git-tree-sha1 = "1af2f79c7eaac3e019a0de41ef63335ff26a0a57" 410 | uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" 411 | version = "0.11.1" 412 | 413 | [[Optim]] 414 | deps = ["Calculus", "DiffEqDiffTools", "ForwardDiff", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "Random", "SparseArrays", "StatsBase", "Test"] 415 | git-tree-sha1 = "a626e09c1f7f019b8f3a30a8172c7b82d2f4810b" 416 | uuid = "429524aa-4258-5aef-a3af-852621145aeb" 417 | version = "0.18.1" 418 | 419 | [[OrderedCollections]] 420 | deps = ["Random", "Serialization", "Test"] 421 | git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" 422 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 423 | version = "1.1.0" 424 | 425 | [[PDMats]] 426 | deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] 427 | git-tree-sha1 = "8b68513175b2dc4023a564cb0e917ce90e74fd69" 428 | uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" 429 | version = "0.9.7" 430 | 431 | [[Parameters]] 432 | deps = ["Markdown", "OrderedCollections", "REPL", "Test"] 433 | git-tree-sha1 = "70bdbfb2bceabb15345c0b54be4544813b3444e4" 434 | uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" 435 | version = "0.10.3" 436 | 437 | [[Pkg]] 438 | deps = ["BinaryProvider", "CredentialsHandler", "Dates", "HTTP", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "TOML", "UUIDs"] 439 | uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 440 | 441 | [[PositiveFactorizations]] 442 | deps = ["LinearAlgebra", "Test"] 443 | git-tree-sha1 = "957c3dd7c33895469760ce873082fbb6b3620641" 444 | uuid = 
"85a6dd25-e78a-55b7-8502-1745935b8125" 445 | version = "0.2.2" 446 | 447 | [[Primes]] 448 | deps = ["Test"] 449 | git-tree-sha1 = "ff1a2323cb468ec5f201838fcbe3c232266b1f95" 450 | uuid = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" 451 | version = "0.4.0" 452 | 453 | [[Printf]] 454 | deps = ["Unicode"] 455 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 456 | 457 | [[Profile]] 458 | deps = ["Printf"] 459 | uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" 460 | 461 | [[PyCall]] 462 | deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Serialization", "Statistics", "Test", "VersionParsing"] 463 | git-tree-sha1 = "6e5bac1b1faf3575731a6a5b76f638f2389561d3" 464 | uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" 465 | version = "1.91.2" 466 | 467 | [[QuadGK]] 468 | deps = ["DataStructures", "LinearAlgebra", "Test"] 469 | git-tree-sha1 = "3ce467a8e76c6030d4c3786e7d3a73442017cdc0" 470 | uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" 471 | version = "2.0.3" 472 | 473 | [[REPL]] 474 | deps = ["InteractiveUtils", "Markdown", "Sockets"] 475 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 476 | 477 | [[Random]] 478 | deps = ["Serialization"] 479 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 480 | 481 | [[Ratios]] 482 | deps = ["Compat"] 483 | git-tree-sha1 = "cdbbe0f350581296f3a2e3e7a91b214121934407" 484 | uuid = "c84ed2f1-dad5-54f0-aa8e-dbefe2724439" 485 | version = "0.3.1" 486 | 487 | [[Reexport]] 488 | deps = ["Pkg"] 489 | git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" 490 | uuid = "189a3867-3050-52da-a836-e630ba90ab69" 491 | version = "0.2.0" 492 | 493 | [[Requires]] 494 | deps = ["Test"] 495 | git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" 496 | uuid = "ae029012-a4dd-5104-9daa-d747884805df" 497 | version = "0.5.2" 498 | 499 | [[Rmath]] 500 | deps = ["BinaryProvider", "Libdl", "Random", "Statistics", "Test"] 501 | git-tree-sha1 = "9a6c758cdf73036c3239b0afbea790def1dabff9" 502 | uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" 503 | version = "0.5.0" 504 | 505 | [[SHA]] 506 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 507 | 508 | [[Serialization]] 509 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 510 | 511 | [[SharedArrays]] 512 | deps = ["Distributed", "Mmap", "Random", "Serialization"] 513 | uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" 514 | 515 | [[Showoff]] 516 | deps = ["Dates"] 517 | git-tree-sha1 = "e032c9df551fb23c9f98ae1064de074111b7bc39" 518 | uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" 519 | version = "0.3.1" 520 | 521 | [[Sockets]] 522 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc" 523 | 524 | [[SortingAlgorithms]] 525 | deps = ["DataStructures", "Random", "Test"] 526 | git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" 527 | uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" 528 | version = "0.3.1" 529 | 530 | [[SparseArrays]] 531 | deps = ["LinearAlgebra", "Random"] 532 | uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 533 | 534 | [[SpecialFunctions]] 535 | deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] 536 | git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" 537 | uuid = "276daf66-3868-5448-9aa4-cd146d93841b" 538 | version = "0.7.2" 539 | 540 | [[StaticArrays]] 541 | deps = ["LinearAlgebra", "Random", "Statistics"] 542 | git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" 543 | uuid = "90137ffa-7385-5640-81b9-e52037218182" 544 | version = "0.11.0" 545 | 546 | [[Statistics]] 547 | deps = ["LinearAlgebra", "SparseArrays"] 548 | uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 549 | 550 | [[StatsBase]] 551 | deps = 
["DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] 552 | git-tree-sha1 = "2b6ca97be7ddfad5d9f16a13fe277d29f3d11c23" 553 | uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" 554 | version = "0.31.0" 555 | 556 | [[StatsFuns]] 557 | deps = ["Rmath", "SpecialFunctions", "Test"] 558 | git-tree-sha1 = "b3a4e86aa13c732b8a8c0ba0c3d3264f55e6bb3e" 559 | uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" 560 | version = "0.8.0" 561 | 562 | [[SuiteSparse]] 563 | deps = ["Libdl", "LinearAlgebra", "SparseArrays"] 564 | uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" 565 | 566 | [[TOML]] 567 | deps = ["Dates"] 568 | uuid = "9d418dce-91a8-11e8-0173-7b01a971d501" 569 | 570 | [[Test]] 571 | deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] 572 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 573 | 574 | [[TimerOutputs]] 575 | deps = ["Crayons", "Printf", "Test", "Unicode"] 576 | git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" 577 | uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 578 | version = "0.5.0" 579 | 580 | [[Tokenize]] 581 | git-tree-sha1 = "0de343efc07da00cd449d5b04e959ebaeeb3305d" 582 | uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" 583 | version = "0.5.4" 584 | 585 | [[Tracker]] 586 | deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"] 587 | git-tree-sha1 = "327342fec6e09f68ced0c2dc5731ed475e4b696b" 588 | uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" 589 | version = "0.2.2" 590 | 591 | [[TranscodingStreams]] 592 | deps = ["Random", "Test"] 593 | git-tree-sha1 = "a25d8e5a28c3b1b06d3859f30757d43106791919" 594 | uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" 595 | version = "0.9.4" 596 | 597 | [[URIParser]] 598 | deps = ["Test", "Unicode"] 599 | git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" 600 | uuid = "30578b45-9adc-5946-b283-645ec420af67" 601 | version = "0.4.0" 602 | 603 | [[UUIDs]] 604 | deps = ["Random", "SHA"] 605 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 606 | 607 | [[Unicode]] 608 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 609 | 610 | [[VersionParsing]] 611 | deps = ["Compat"] 612 | git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669" 613 | uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" 614 | version = "1.1.3" 615 | 616 | [[WoodburyMatrices]] 617 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Test"] 618 | git-tree-sha1 = "21772c33b447757ec7d3e61fcdfb9ea5c47eedcf" 619 | uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6" 620 | version = "0.4.1" 621 | 622 | [[ZipFile]] 623 | deps = ["BinaryProvider", "Libdl", "Printf"] 624 | git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" 625 | uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" 626 | version = "0.8.3" 627 | --------------------------------------------------------------------------------