├── images
│   ├── i7.jpg
│   ├── GK110.jpg
│   ├── Julia6x.png
│   ├── warp-branch.png
│   ├── false-sharing.gif
│   └── 40-years-processor-trend.png
├── Project.toml
├── 100 Overview.jl
├── README.jl
├── 060 Tasks.jl
├── 080 GPUs.jl
├── 030 SIMD.jl
├── 020 Serial Performance.jl
├── 050 Parallel Algorithms.jl
├── 040 Multithreading.jl
├── 070 Distributed.jl
└── Manifest.toml

/images/i7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/i7.jpg
--------------------------------------------------------------------------------
/images/GK110.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/GK110.jpg
--------------------------------------------------------------------------------
/images/Julia6x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/Julia6x.png
--------------------------------------------------------------------------------
/images/warp-branch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/warp-branch.png
--------------------------------------------------------------------------------
/images/false-sharing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/false-sharing.gif
--------------------------------------------------------------------------------
/images/40-years-processor-trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbauman/ParallelWorkshop2019/HEAD/images/40-years-processor-trend.png
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Compose = "a81c6b42-2e10-5240-aca2-a61377ecd94b"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
DistributedArrays = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94"
FileWatching = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Gadfly = "c91e804a-d5a3-530f-b6f0-dfbca275c004"
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
--------------------------------------------------------------------------------
/100 Overview.jl:
--------------------------------------------------------------------------------
# # Summary
#
# * Challenges of parallel computing
#     * Order of execution
#         * the possibility of out-of-order execution
#         * simultaneous access and mutation
#     * Data access and movement
#     * Code access and movement
#     * Appropriately matching the parallelism strategy to your machine capabilities
#     * Appropriately matching the parallelism strategy with the problem at hand
#
# * Parallelism strategies
#     * SIMD
#     * Multithreading
#     * Tasks
#     * Multi-process
#         * Shared memory
#         * Distributed memory
#     * GPU programming
#
#
# ## Why so many kinds of parallelism?
#
# * Not all problems are created equal
# * Not all computing machines are created equal
# * We want to maximize computing while minimizing overhead
# * The chosen solution will depend upon the amount of computing in each inner loop
#   and the amount of synchronization that is required between loops.

--------------------------------------------------------------------------------
/README.jl:
--------------------------------------------------------------------------------
# # JuliaCon 2019 Parallel Computing Workshop
#
# This workshop will cover:
#
# * Introduction to parallelism
#     * What is happening to our computers?
#
# * Parallelism strategies
#     * SIMD and best single-core performance (brief overview)
#     * Multi-threading (hands on)
#     * Cooperative multi-tasking
#     * Parallel algorithm design
#     * Multi-process (hands on)
#         * Shared memory
#         * Distributed memory
#     * GPU programming
#
# * Challenges of parallel computing
#     * Order of execution
#         * the possibility of out-of-order execution
#         * race conditions with simultaneous access and mutation
#     * Data access and movement
#     * Code access and movement
#     * Appropriately matching the parallelism strategy to your machine capabilities
#     * Appropriately matching the parallelism strategy with the problem at hand

#-

# ## What is happening to our computers!?
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/40-years-processor-trend.png)
#
# Not only have we gained multiple cores, but processors have become extremely
# complex, with multiple levels of caches, pipelines, predictions, speculations...
#
# ## What is hard about parallel computing
# * We don't think in parallel
# * We learn to write and reason about programs serially
# * The desire for parallelism often comes _after_ you've written your algorithm (and found it too slow!)
#
# ## Summary:
# * Current computer architectures push us towards parallel programming for peak performance — even if we're not on a cluster!
# * But it's hard to design good parallel algorithms
# * And it's hard to express and reason about those algorithms
--------------------------------------------------------------------------------
/060 Tasks.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # A brief introduction to Tasks
#
# You're working on a computer that's doing _lots_ of things. It's managing
# inputs and outputs, delegating control of the CPU between Julia and _all_ of
# the other applications you have running. This wasn't always the case — does
# anyone remember the days before you could just switch between applications?
#
# It's not really doing all these things at once, but for the most part it
# gives the _appearance_ of parallelism. We think about our computers as doing
# _lots_ of things simultaneously — but it's not really simultaneous. It's just
# switching between tasks so fast that it feels simultaneous.
#
# This kind of task switching is perfect for situations like an operating system
# where you're just waiting for user input most of the time. The OS multitasking
# you're familiar with is called "preemptive" multitasking — the operating system
# sits at the top and can arbitrarily control who gets to run when. Julia's task
# system uses cooperative multitasking (also known as coroutines or green threads).
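# To make "cooperative" concrete, here's a minimal sketch (the names are just
# illustrative; the `@async` macro introduced below is the usual spelling): a
# `Task` wraps a zero-argument function, `schedule` hands it to Julia's
# scheduler, and control only switches at yield points like `yield()`,
# `sleep`, or I/O.

function count_up()
    for i in 1:3
        println("cooperative step $i")
        yield() # voluntarily hand control back to the scheduler
    end
end
t = Task(count_up)
schedule(t)
wait(t) # block until the task has finished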

#-

# Tasks work best when they're waiting for some _external_ condition to complete
# their work. Let's say we had a directory "results" and wanted to process any
# new files that appeared there:

using FileWatching
isdir("results") || mkdir("results")
watch_folder("results", #= time out in seconds =# 5)

# Julia will happily sit there and wait for something to happen... but it's
# blocking anything else from happening while it's doing so! This is the perfect
# case for a Task. We can say we want a given expression to run asynchronously
# in a Task with the `@async` macro:

t = @async watch_folder("results") # no timeout means it will wait forever!

#-

run(`touch results/0.txt`)

#-

file, info = fetch(t)
file # |> process

# We can even bundle this up into a repeating task:

isdone = false
function process_folder(dir)
    !isdir("processed-results") && mkdir("processed-results")
    while !isdone
        file, info = watch_folder(dir)
        path = joinpath(dir, file)
        if isfile(path)
            print("processing $path...")
            run(`cp $path processed-results/$file`) # Or actually do real work...
        end
    end
end

t = @async process_folder("results")

#-

run(`touch results/1.txt`)
readdir("processed-results")

#-

run(`touch results/2.txt`)
readdir("processed-results")

#-

isdone = true
run(`touch results/3.txt`)
readdir("processed-results")

#-

run(`touch results/4.txt`)
readdir("processed-results")

#-

rm("results", recursive=true)
rm("processed-results", recursive=true)

# ## Quiz:
#
# How long will this take?

@time for i in 1:10
    sleep(1)
end

# What about this?

@time for i in 1:10
    @async sleep(1)
end

# And finally, this?

@time @sync for i in 1:10
    @async sleep(1)
end

# Now what if I had something that actually did work?

function work(N)
    series = 1.0
    for i in 1:N
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end
work(1)
@time work(100_000_000)

#-

@time @sync for i in 1:10
    @async work(100_000_000)
end

# # So what's happening here?
#
# `sleep` is nicely cooperating with our tasks:

methods(sleep)

# # Fetching values from tasks

# You can even fetch values from tasks:

t = @async (sleep(5); rand())

wait(t)

fetch(t)

# # Key takeaways
#
# There is a lot more to tasks, but they form the foundation for reasoning about
# actually _doing_ computation in parallel (and not just hoping that things will
# cooperate for us to emulate parallelism by task switching).
#
# * `@async` creates and starts running a task
# * `@sync` waits for them all to complete
# * We can reason about something that runs asynchronously and may return a value
#   at some point in the future with `fetch`. Or we can just `wait` for it.
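# As a small taste of that "lot more" (a sketch, not covered in this workshop):
# tasks can also hand values to one another through a `Channel`, where `put!`
# and `take!` block as needed to keep a producer and a consumer in step.

ch = Channel{Int}(4)   # a buffered channel holding up to four Ints
@async for i in 1:5
    put!(ch, i^2)      # blocks whenever the buffer is full
end
for _ in 1:5
    println(take!(ch)) # blocks until a value is available
end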
--------------------------------------------------------------------------------
/080 GPUs.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # GPUs
#
# The graphics processor in your computer is _itself_ like a mini-computer highly
# tailored for massively and embarrassingly parallel operations (like computing how light will bounce
# off of every point on a 3D mesh of triangles).
#
# Of course, recently their utility in other applications has become more clear,
# and thus the GPGPU was born.
#
# Just like how we needed to send data to other processes, we need to send our
# data to the GPU to do computations there.

#-

# ## How is a GPU different from a CPU?
#
# This is what a typical consumer CPU looks like:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/i7.jpg)
#
# And this is what a GPU looks like:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/GK110.jpg)
#
# Each SMX isn't just one "core"; each is a _streaming multiprocessor_ capable of running hundreds of threads simultaneously itself. There are so many threads, in fact, that you reason about them in groups of 32 — called a "warp". No, not [that warp](https://www.google.com/search?tbm=isch&q=warp&tbs=imgo:1), [this one](https://www.google.com/search?tbm=isch&q=warp%20weaving&tbs=imgo:1).
#
# The card above supports up to 6 warps per multiprocessor, with 32 threads each, times 15 multiprocessors... 2880 threads at a time!
#
# Also note the memory interfaces.
#
# --------------
#
# Each thread is relatively limited — and a warp is almost like a SIMD unit that supports branching.
# Except it's still only executing one instruction even after a branch:
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/warp-branch.png)

#-

# You can inspect the installed GPUs with `nvidia-smi`:

run(`nvidia-smi`)

# ## Example
#
# The deep learning MNIST example: https://fluxml.ai/experiments/mnist/
#
# This is how it looks on the CPU:

using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition

imgs = MNIST.images()
labels = onehotbatch(MNIST.labels(), 0:9)

## Partition into batches of size 32
train = [(cat(float.(imgs[i])..., dims = 4), labels[:,i])
         for i in partition(1:60_000, 32)]
## Prepare test set (first 1,000 images)
tX = cat(float.(MNIST.images(:test)[1:1000])..., dims = 4)
tY = onehotbatch(MNIST.labels(:test)[1:1000], 0:9)

m = Chain(
    Conv((3, 3), 1=>32, relu),
    Conv((3, 3), 32=>32, relu),
    x -> maxpool(x, (2,2)),
    Conv((3, 3), 32=>16, relu),
    x -> maxpool(x, (2,2)),
    Conv((3, 3), 16=>10, relu),
    x -> reshape(x, :, size(x, 4)),
    Dense(90, 10), softmax)

loss(x, y) = crossentropy(m(x), y)
accuracy(x, y) = mean(onecold(m(x)) .== onecold(y))
opt = ADAM()
Flux.train!(loss, Flux.params(m), train[1:1], opt, cb = () -> @show(accuracy(tX, tY)))
@time Flux.train!(loss, Flux.params(m), train[1:10], opt, cb = () -> @show(accuracy(tX, tY)))

# Now let's re-do it on a GPU. "All" it takes is moving the data there with `gpu`!

include(datapath("scripts/fixupCUDNN.jl")) # JuliaBox uses an old version of CuArrays; this backports a fix for it
gputrain = gpu.(train[1:10])
gpum = gpu(m)
gputX = gpu(tX)
gputY = gpu(tY)
gpuloss(x, y) = crossentropy(gpum(x), y)
gpuaccuracy(x, y) = mean(onecold(gpum(x)) .== onecold(y))
gpuopt = ADAM()
Flux.train!(gpuloss, Flux.params(gpum), gpu.(train[1:1]), gpuopt, cb = () -> @show(gpuaccuracy(gputX, gputY)))
@time Flux.train!(gpuloss, Flux.params(gpum), gputrain, gpuopt, cb = () -> @show(gpuaccuracy(gputX, gputY)))

# ## Defining your own GPU kernels
#
# So that's leveraging Flux's ability to work with GPU arrays — which is magical
# and awesome — but you don't always have a library to lean on like that.
# How might you define your own GPU kernel?
#
# Recall the monte carlo pi example:

function serialpi(n)
    inside = 0
    for i in 1:n
        x, y = rand(), rand()
        inside += (x^2 + y^2 <= 1)
    end
    return 4 * inside / n
end

# How could we express this on the GPU?

using CuArrays.CURAND
function findpi_gpu(n)
    4 * sum(curand(Float64, n).^2 .+ curand(Float64, n).^2 .<= 1) / n
end
findpi_gpu(10_000_000)

#-

using BenchmarkTools
@btime findpi_gpu(10_000_000)
@btime serialpi(10_000_000)

# That leans on broadcast to build the GPU kernel — and is creating three arrays
# in the process — but it's still much faster than our serial pi from before.

#-

# In general, using CuArrays and broadcast is one of the best ways to just
# get everything to work.
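# For example, a minimal sketch of that workflow (assuming a CUDA-capable GPU
# and the CuArrays package from this project's environment):

using CuArrays
xs = cu(rand(10_000))  # copy the data over to the GPU
ys = 2 .* xs .^ 2 .+ 1 # the dotted operations fuse into a single GPU kernel
sum(ys)                # reductions run on the GPU, too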
# If you really want to get your hands dirty, you
# can use [CUDAnative.jl](https://github.com/JuliaGPU/CUDAnative.jl) to manually specify exactly how everything works,
# but be forewarned, it's not for the [faint of heart](https://github.com/JuliaGPU/CUDAnative.jl/blob/master/examples/reduce/reduce.jl)! (If you've done CUDA
# programming in C or C++, it's very similar.)
--------------------------------------------------------------------------------
/030 SIMD.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # SIMD: The parallelism that can (sometimes) happen automatically
#
# SIMD: Single-instruction, multiple data
#
# (Also confusingly called vectorization)

#-

# ## The architecture
#
# Instead of computing four sums sequentially:
#
# \begin{align}
# x_1 + y_1 &\rightarrow z_1 \\
# x_2 + y_2 &\rightarrow z_2 \\
# x_3 + y_3 &\rightarrow z_3 \\
# x_4 + y_4 &\rightarrow z_4
# \end{align}
#
# Modern processors have vector processing units that can do it all at once:
#
# $$
# \left(\begin{array}{cc}
# x_1 \\
# x_2 \\
# x_3 \\
# x_4
# \end{array}\right)
# +
# \left(\begin{array}{cc}
# y_1 \\
# y_2 \\
# y_3 \\
# y_4
# \end{array}\right)
# \rightarrow
# \left(\begin{array}{cc}
# z_1 \\
# z_2 \\
# z_3 \\
# z_4
# \end{array}\right)
# $$

#-

# ## Making it happen

#-

# Simple task: compute the sum of a vector:

A = rand(100_000)
function simplesum(A)
    result = zero(eltype(A))
    for i in eachindex(A)
        @inbounds result += A[i]
    end
    return result
end

simplesum(A)

#-

using BenchmarkTools
@btime simplesum($A)

# So, is that good?

@btime sum($A)

# We're slower than the builtin `sum` — and we're getting a different answer, too! Let's look at what happens with a 32-bit float instead of a 64-bit one. Each element has half the number of bits, so let's also double the length (so the total number of bits processed remains constant).

A32 = rand(Float32, length(A)*2)
@btime simplesum($A32)
@btime sum($A32);

# That's even worse! What's going on here? We're seeing a performance difference of an even multiple — perhaps Julia's builtin sum is using some parallelism? Let's try using SIMD ourselves:

function simdsum(A)
    result = zero(eltype(A))
    @simd for i in eachindex(A)
        @inbounds result += A[i]
    end
    return result
end
@btime simdsum($A)
@btime simdsum($A32)

# What did that do and why don't we always use `@simd for` — or why doesn't Julia
# just always use `@simd` for every `for` loop automatically? Look at the values:

simplesum(A), simdsum(A), sum(A)

#-

simplesum(A32), simdsum(A32), sum(A32)

# Why aren't they the same?
#
# Without `@simd`, Julia is doing _exactly_ what we told it to do: it's taking
# each element of our array and adding it to a big pile sequentially.
# Our answer
# is smaller than what Julia's builtin `sum` thinks it is: that's because as our
# pile gets bigger we begin losing the lower bits of each element that we're
# adding, and those small losses begin to add up!
#
# The `@simd` macro tells Julia that it can re-arrange floating point additions —
# even if it would change the answer. Depending on your CPU, this may lead to 2x or 4x
# or even 8x parallelism. Essentially, Julia is computing independent sums for
# the even indices and the odd indices simultaneously:
#
# \begin{align}
# odds &\leftarrow 0 \\
# evens &\leftarrow 0 \\
# \text{loop}&\ \text{odd}\ i: \\
# &\left(\begin{array}{cc}
# odds \\
# evens
# \end{array}\right)
# \leftarrow
# \left(\begin{array}{cc}
# odds \\
# evens
# \end{array}\right)
# +
# \left(\begin{array}{cc}
# x_{i} \\
# x_{i+1}
# \end{array}\right) \\
# total &\leftarrow evens + odds
# \end{align}
#
# In many cases, Julia can and does know that a for-loop can be SIMD-ed and it
# will take advantage of this by default!

B = rand(1:10, 100_000)
@btime simplesum($B)
@btime sum($B)
B32 = rand(Int32(1):Int32(10), length(B)*2)
@btime simplesum($B32)
@btime simdsum($B32)

# How can we see if something is getting vectorized? Look for vector types
# like `<8 x float>` in the generated LLVM IR:

@code_llvm simdsum(A32)

# So what are the challenges?
#
# * The biggest hurdle is that you have to convince Julia and LLVM that it's able to
#   use SIMD instructions for your given algorithm. That's not always possible.
# * There are lots of limitations on what can and cannot be SIMD-ed:

@doc @simd

# * You do need to think through the consequences of re-ordering your algorithm.

#-

# ## A slightly trickier case

using BenchmarkTools

#-

function diff!(A, B)
    A[1] = B[1]
    for i in 2:length(A)
        @inbounds A[i] = B[i] - B[i-1]
    end
    return A
end
A = zeros(Float32, 100_000)
B = rand(Float32, 100_000)

diff!(A, B)
[B[1];diff(B)] == A

#-

@btime diff!($A, $B)
@btime diff($B);

# But what happens if we do it in-place?

Bcopy = copy(B)
@btime diff!($Bcopy, $Bcopy);

# What happened?

@code_llvm diff!(A, B)

# We can manually assert that arrays don't alias (or have any loop-dependencies)
# with the very special `@simd ivdep` flag, but this can be disastrous:

function unsafe_diff!(A, B)
    A[1] = B[1]
    @simd ivdep for i in 2:length(A)
        @inbounds A[i] = B[i] - B[i-1]
    end
    return A
end
@btime unsafe_diff!($A, $B)
[B[1];diff(B)] == A
Bcopy = copy(B)
unsafe_diff!(Bcopy, Bcopy)
[B[1];diff(B)] == Bcopy

# If you really want to get your hands dirty, you can use the [SIMD.jl](https://github.com/eschnett/SIMD.jl)
# package to manually specify those `<8 x float>` things that LLVM generates.
# BUT: this is tricky and a pain; often it's enough just to be aware of what makes
# Julia code automatically SIMD-able, some of the cases where it may fail, and
# how to check its work.
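# Just to give a flavor of SIMD.jl, a minimal sketch (note that SIMD.jl is
# *not* part of this project's environment, so this assumes you've added it):

using SIMD
v = Vec{4,Float32}((1f0, 2f0, 3f0, 4f0)) # an explicit 4-wide SIMD vector
w = v + v # one vectorized add across all four lanes
sum(w)    # a horizontal reduction back to a scalar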

#-

# ## SIMD
#
# * Exploits built-in parallelism in a processor
# * Best for small, tight innermost loops
# * Often happens automatically if you're careful
#     * Follow the [performance best practices](https://docs.julialang.org/en/v1/manual/performance-tips/)
#     * `@inbounds` any array accesses
#     * No branches or (non-inlined) function calls
# * Can use `@simd` to allow Julia to break some rules to make it happen
#     * But be careful, especially with `@simd ivdep`!
# * Depending on processor and types involved, can yield 2-16x gains with extraordinarily little overhead
# * Smaller datatypes can improve this further; use `Float32` instead of `Float64`
#   if possible, `Int32` instead of `Int64`, etc.
# * When buying a new processor, look for [AVX-512](https://en.wikichip.org/wiki/x86/avx-512) support
--------------------------------------------------------------------------------
/020 Serial Performance.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Fast (serial) programming with Julia
#
# Yes, this is a parallel computing course — but to write efficient parallel
# programs we first must learn how to write fast serial Julia code. This is
# a rapid primer in high performance (serial) programming.
#
# I _highly_ recommend reviewing the [Performance Tips](https://docs.julialang.org/en/v1.1/manual/performance-tips/)
# in the manual. This is only going to briefly introduce some of the main concepts.

#-

# ## Measure, measure, measure.
#
# It is very easy to experiment in Julia; you can rapidly try many options and
# see which is the fastest.

#-

# Use the [BenchmarkTools](https://github.com/JuliaCI/BenchmarkTools.jl) package:

using BenchmarkTools

"""
    findclosest(data, point)

A simple example that returns the element in `data` that is closest to `point`
"""
function findclosest(data, point)
    _, index = findmin(abs.(data .- point))
    return data[index]
end
data = rand(5000)
findclosest(data, 0.5)

#-

@time findclosest(data, 0.5)

#-

@benchmark findclosest($data, $0.5)

# ### Profile!

using Profile

Profile.clear()
@profile for _ in 1:100000; findclosest(data, 0.5); end

Profile.print(maxdepth=11)

# ### Iterate!
#
# Before we had:
# ```julia
# function findclosest(data, point)
#     _, index = findmin(abs.(data .- point))
#     return data[index]
# end
# ```
#
# Let's come up with a new definition that can combine the two operations:

function findclosest2(data, point)
    bestval = first(data)
    bestdist = abs(bestval - point)
    for elt in data
        dist = abs(elt - point)
        if dist < bestdist
            bestval = elt
            bestdist = dist
        end
    end
    return bestval
end

## And do a spot-check to make sure we did the optimization correctly:
findclosest2(data, 0.5) == findclosest(data, 0.5)

#-

@benchmark findclosest2($data, $0.5)
# ## A quick word on macros
#
# Macros are those funny things starting with `@`. They can reinterpret what
# you write and do something different — essentially introducing a new keyword.
#
# For example, the `@assert` macro simply takes an expression and throws an
# exception if it returns `false`.

@assert 2+2 == 4

# It does this by literally re-writing what you wrote. You can see it in action
# with `@macroexpand`:

@macroexpand @assert 2+2 == 4

# Each macro can define its own special syntax, and this is used extensively for
# code introspection, serial performance improvements, and — perhaps most
# importantly — parallelization primitives!

#-

# ## How is Julia fast?
#
# By understanding the basics of how Julia _can_ be fast, you can get a better
# sense for how to write fast Julia code.
#
# Perhaps most importantly, Julia can reason about types. Recall: this is the definition of `findclosest2`:
#
# ```julia
# function findclosest2(data, point)
#     bestval = first(data)
#     bestdist = abs(bestval - point)
#     for elt in data
#         dist = abs(elt - point)
#         if dist < bestdist
#             bestval = elt
#             bestdist = dist
#         end
#     end
#     return bestval
# end
# ```

@code_typed optimize=false findclosest2(data, 0.5)

#-

typeof(data)

#-

newdata = Real[data...]
typeof(newdata)

#-

@code_typed optimize=false findclosest2(newdata, 0.5)

#-

@benchmark findclosest2(newdata, 0.5)

#-

@code_warntype findclosest2(newdata, 0.5)

# ### Type stability
#
# A function is called type-stable if Julia is able to infer what the output
# type will be based purely on the types of the inputs.
#
# Things that thwart type stability:
# * Running things in global scope: create functions instead!
# * Non-concretely typed containers
# * Structs with abstractly-typed fields
# * Non-constant globals (they might change!)
# * Functions that change what they return based on the _values_ (see the sketch below):
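# A minimal sketch of that last point (the function names are illustrative):

unstable(x) = x > 0 ? x : 0       # a Float64 input may return an Int `0`!
stable(x)   = x > 0 ? x : zero(x) # always returns the same type as `x`

@code_warntype unstable(0.5) # note the `Union{Float64, Int64}` return type
@code_warntype stable(0.5)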

#-

# #### More on macros
#
# Each and every macro can define its own syntax. The `@benchmark` macro uses `$` in a special way.
# The goal behind `@benchmark` is to evaluate the performance of a code snippet
# as though it were written in a function. Use `$` to flag what will be an argument
# or local variable in the function. Forgetting to use `$`s may result in faster
# or slower timings than real-world performance.

x = 0.5 # non-constant global
@btime sin(x)
@btime sin($x)

#-

@btime sin(0.5) # constant literal!
@btime sin($0.5)

# ### Specializations
#
# Julia's reasoning about types is particularly important since it generates
# specialized machine code specifically for the given arguments.

@code_llvm 1 + 2

# This applies just the same to any functions we write — even the more complicated ones:

@code_llvm findclosest2(Float32[2.2,3.4,4.5],Float32(3.2))

# And every distinct combination of argument types gets its own specialization — here's the same function with an `Int` point:

remove_comments(s) = join(filter(x->!startswith(x, ";"), split(s, "\n")), "\n")
sprint(code_llvm, findclosest2, Tuple{Vector{Float32}, Int}) |> remove_comments |> print

# ## Modern hardware effects
#
# There are lots of little performance quirks in modern computers; I'll just
# cover two interesting ones here:

@benchmark findclosest2($data, $0.5)

#-

sorteddata = sort(data)
@benchmark findclosest2($sorteddata, $0.5)

# Unfortunately, this isn't demonstrable on a hardened cloud platform... because
# it's a huge security risk!
#
# * https://meltdownattack.com
# * https://discourse.julialang.org/t/psa-microbenchmarks-remember-branch-history/17436

idxs = sortperm(data)
sortedview = @view data[idxs]
@benchmark findclosest2($sortedview, $0.5)

# ### Memory latencies
#
# | System Event                   | Actual Latency | Scaled Latency |
# | ------------------------------ | -------------- | -------------- |
# | One CPU cycle                  | 0.4 ns         | 1 s            |
# | Level 1 cache access           | 0.9 ns         | 2 s            |
# | Level 2 cache access           | 2.8 ns         | 7 s            |
# | Level 3 cache access           | 28 ns          | 1 min          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          |
# | Intel Optane memory access     | <10 μs         | 7 hrs          |
# | NVMe SSD I/O                   | ~25 μs         | 17 hrs         |
# | SSD I/O                        | 50–150 μs      | 1.5–4 days     |
# | Rotational disk I/O            | 1–10 ms        | 1–9 months     |
# | Internet call: SF to NYC       | 65 ms          | 5 years        |
# | Internet call: SF to Hong Kong | 141 ms         | 11 years       |
#
# (from https://www.prowesscorp.com/computer-latency-at-a-human-scale/)

#-

# # Key Takeaways
#
# * Measure, measure, measure!
# * Get familiar with the [Performance Tips](https://docs.julialang.org/en/v1/manual/performance-tips/)
# * Don't be scared of `@code_typed`/`@code_warntype` and `@code_llvm`
--------------------------------------------------------------------------------
/050 Parallel Algorithms.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Parallel Algorithms: Thinking in Parallel
#
# Now that we're starting to see the challenges of parallelism, it's worth taking
# a step back and examining how we might go about designing parallel algorithms.
#
# This is adapted from a [workshop paper](http://jiahao.github.io/parallel-prefix/) by Jiahao Chen and
# Alan Edelman entitled "Parallel Prefix Polymorphism Permits Parallelization, Presentation & Proof" that
# appeared in the proceedings of the [First Workshop for High Performance Technical Computing in Dynamic
# Languages](http://jiahao.github.io/hptcdl-sc14/), held in conjunction with [SC14: The International Conference on High Performance Computing, Networking, Storage and Analysis](http://sc14.supercomputing.org/).

#-

using Compose, Gadfly

# # `reduce()`
#
# Reduction applies a binary operator to a vector repeatedly to return a scalar. Thus `+` becomes `sum`, and `*` becomes `prod`.
#
# It is considered a basic parallel computing primitive.

reduce(+, 1:8) == sum(1:8)  # triangular numbers

#-

reduce(*, 1:8) == prod(1:8) # factorials
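# The reason reductions parallelize well is associativity: we can split the
# work, reduce each piece independently, and then combine the partial results.
# A two-way split as a sketch (a real implementation splits across cores):

left  = reduce(+, 1:4) # one worker could compute this half...
right = reduce(+, 5:8) # ...while another computes this half
left + right == reduce(+, 1:8)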
# You can also use reduce to compute Fibonacci numbers using their recurrences.

M = [1 1; 1 0]
reduce(*, fill(M, 3))
prod(fill(M, 3))

#-

n = 40 # Try changing n to pick different values (try between 0-100)
@show prod(fill(big.(M), n))

# # `prefix` or `scan`
#
# Having discussed `reduce`, we are now ready for the idea behind prefix sum.
# Prefix or scan or accumulate has long been considered an important parallel
# primitive as well.
#
# Suppose you wanted to compute the partial sums of a vector, i.e. given
# `y[1:n]`, we want to overwrite the vector `y` with the vector of partial sums
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = y[1] + y[2]
# new_y[3] = y[1] + y[2] + y[3]
# ...
# ```
#
# At first blush, it seems impossible to parallelize this, since
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = new_y[1] + y[2]
# new_y[3] = new_y[2] + y[3]
# ...
# ```
#
# which appears to be an intrinsically serial process. As written with a `+`
# operator, this is `cumsum` — but note that it can generalize to any operation.

function prefix_serial!(⊕, y)
    for i=2:length(y)
        y[i] = y[i-1] ⊕ y[i]
    end
    y
end

#-

@show prefix_serial!(+, [1:8;])
@show cumsum(1:8)

#-

@show prefix_serial!(*, [1:8;])
@show cumprod(1:8)

#-

@show accumulate(*, [1:8;])

# However, it turns out that because these operations are associative, we can regroup the _order_ of how these sums or products are carried out. (This of course extends to other associative operations, too.) Another ordering of 8 associative operations is provided by `prefix8!`:

## Magic :)
function prefix8!(⊕, y)
    length(y)==8 || error("length 8 only")
    for i in (2,4,6,8); y[i] = y[i-1] ⊕ y[i]; end
    for i in (  4,  8); y[i] = y[i-2] ⊕ y[i]; end
    for i in (      8); y[i] = y[i-4] ⊕ y[i]; end
    for i in (    6  ); y[i] = y[i-2] ⊕ y[i]; end
    for i in ( 3,5,7 ); y[i] = y[i-1] ⊕ y[i]; end
    y
end

#-

prefix8!(+, [1:8;]) == cumsum(1:8)

# In fact, this can generalize beyond just length-8 arrays:

## More magic
function prefix!(⊕, y)
    l=length(y)
    k=ceil(Int, log2(l))
    @inbounds for j=1:k, i=2^j:2^j:min(l, 2^k) # "reduce"
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    @inbounds for j=(k-1):-1:1, i=3*2^(j-1):2^j:min(l, 2^k) # "expand"
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    y
end

#-

A = rand(0:9, 123)
prefix!(*, copy(A)) == cumprod(A)

# ## What is this magic?

#-

# We can visualize the operations with a little bit of trickery. In Julia, arrays are simply types that expose the array protocol. In particular, they need to implement methods for the generic functions `length`, `getindex` and `setindex!`. The last two are used in indexing operations, since statements
#
#     y = A[1]
#     A[3] = y
#
# get desugared to
#
#     y = getindex(A, 1)
#     setindex!(A, y, 3)
#
# respectively.
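# (A quick sanity check of that desugaring:)

A = [10, 20, 30]
getindex(A, 1) == A[1] # true: `A[1]` *is* a `getindex` call
setindex!(A, 99, 3); A # A is now [10, 20, 99]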
#
# We can trace through the iterable by introducing a dummy array type, `AccessArray`, which records every access to `getindex` and `setindex!`.
#
# Specifically:
#
# - `length(A::AccessArray)` returns the length of the array it wraps
# - `getindex(A::AccessArray, i)` records read access to the index `i` in the `A.read` field and then actually returns the value in the array it wraps.
# - `setindex!(A::AccessArray, x, i)` records write access to the index `i`. The `A.history` field is appended with a new tuple consisting of the current `A.read` field and the index `i`, and then it performs the assignment.
#
# The way `AccessArray` works, it assumes an association between a single `setindex!` call and all the preceding `getindex` calls since the previous `setindex!` call, which is sufficient for the purposes of tracing through prefix calls.

mutable struct AccessArray{T,N,A}
    data :: A
    read :: Vector{Int}
    history :: Vector{Tuple{Vector{Int},Int}}
end
AccessArray(A) = AccessArray{eltype(A), ndims(A), typeof(A)}(A, Int[], Int[])

Base.length(A::AccessArray) = length(A.data)

function Base.getindex(A::AccessArray, i::Int)
    push!(A.read, i)
    A.data[i]
end

function Base.setindex!(A::AccessArray, x, i::Int)
    push!(A.history, (A.read, i))
    A.read = Int[]
    A.data[i] = x
end

#-

M = AccessArray(rand(8))

#-

M[7] = M[3] + M[2]

#-

M.history

# So now we can trace the access pattern when calling `prefix8!`:

A = prefix8!(+, AccessArray(rand(8)))

#-

A.history

# Now let's visualize this! Each entry in `A.history` is rendered by a gate object:

using Compose: circle, mm

#-

struct Gate{I,O}
    ins :: I
    outs :: O
end

import Gadfly.render

function render(G::Gate, x₁, y₁, y₀; rᵢ=0.1, rₒ=0.25)
    ipoints = [(i, y₀+rᵢ) for i in G.ins]
    opoints = [(i, y₀+0.5) for i in G.outs]
    igates = [circle(i..., rᵢ) for i in ipoints]
    ogates = [circle(i..., rₒ) for i in opoints]
    lines = [line([i, j]) for i in ipoints, j in opoints]
    compose(context(units=UnitBox(0.5,0,x₁,y₁+1)),
        compose(context(), stroke(colorant"black"), fill(colorant"white"),
            igates..., ogates...),
        compose(context(), linewidth(0.3mm), stroke(colorant"black"),
            lines...))
end

A = Gate([1,2], 2)
render(A, 2, 0, 0)

# Now we render the whole algorithm. We have to scan through the trace twice; the first time merely calculates the maximum depth that needs to be drawn and the second time actually generates the objects.

function render(A::AccessArray)
    # Scan to find maximum depth
    olast = depth = 0
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        olast = maximum(y[2])
    end
    maxdepth = depth

    olast = depth = 0
    C = []
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        push!(C, render(Gate(y...), length(A), maxdepth, depth))
        olast = maximum(y[2])
    end

    push!(C, compose(context(units=UnitBox(0.5,0,length(A),1)),
        [line([(i,0), (i,1)]) for i=1:length(A)]...,
        linewidth(0.1mm), stroke(colorant"grey")))
    compose(context(), C...)
end

#-

render(prefix!(+, AccessArray(zeros(8))))

# Now we can see that `prefix!` rearranges the operations to form two spanning trees —
# try changing the number of elements!

render(prefix!(+, AccessArray(zeros(16))))

# as contrasted with the serial code:

render(prefix_serial!(+, AccessArray(zeros(8))))

# # Now exploit the parallelism in the _algorithm_ to use a parallel _implementation_

using .Threads
function prefix_threads!(⊕, y)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @threads for i=2^j:2^j:min(l, 2^k) # "reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @threads for i=3*2^(j-1):2^j:min(l, 2^k) # "expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end

A = rand(500_000);

using BenchmarkTools
@btime prefix_serial!(+, $(copy(A)));
@btime prefix!(+, $(copy(A)));
@btime prefix_threads!(+, $(copy(A)));

prefix_threads!(+, copy(A)) == prefix!(+, copy(A)) ≈ cumsum(A)

# # Thinking in parallel
#
# Notice how we didn't need to contort ourselves in making our algorithm
# work with `@threads`. We really did _just_ tack a `@threads` on it and it
# just worked. It was both accurate _and_ fast.
#
# Coming up with rearrangements that make your particular algorithm parallel
# friendly isn't always easy, but when possible it makes everything else
# just fall out naturally.
#
# Finally, note that there can be clever ways to visualize algorithms as sanity checks.
--------------------------------------------------------------------------------
/040 Multithreading.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Multithreading
#
# Now we're finally ready to start talking about running things on multiple
# processors! Most computers (even cell phones) these days have multiple cores
# or processors — so the obvious place to start working with parallelism is
# making use of those from within our Julia process.
#
# The first challenge, though, is knowing precisely how many "processors" you have.
# "Processors" is in scare quotes because, well, it's complicated.

versioninfo(verbose = true)

#-

using Hwloc
Hwloc.num_physical_cores()

# What your computer reports as the number of processors might not be the same
# as the total number of "cores". While sometimes virtual processors can add
# performance, parallelizing a typical numerical computation over these virtual
# processors will lead to significantly worse performance because they still
# have to share much of the nuts and bolts of the computation hardware.

#-

# Julia is somewhat multithreaded by default! BLAS calls (like matrix multiplication) are
# already threaded:

using BenchmarkTools
A = rand(1000, 1000);
B = rand(1000, 1000);
@benchmark $A*$B

# This is — by default — already using all your CPU cores!
# You can see the effect
# by changing the number of threads (which BLAS supports doing dynamically):

using LinearAlgebra
BLAS.set_num_threads(1)
@benchmark $A*$B
BLAS.set_num_threads(4)
@benchmark $A*$B

# ## What does it look like to implement your _own_ threaded algorithm?

using .Threads

nthreads()

# Julia currently needs to start up knowing that it has threading support enabled.
#
# You do that with an environment variable. To get four threads, start Julia with:
#
# ```
# JULIA_NUM_THREADS=4 julia
# ```

run(`env JULIA_NUM_THREADS=4 julia -E 'using .Threads; nthreads()'`)

# The other way to do it is in JuliaPro itself:
#
# * Go to the Julia Menu -> Settings -> Number of Threads
# * By default it'll choose a "good" number for you

threadid()

# So we're currently on thread 1. Of course a serial loop like this will
# just set every element to one:

A = zeros(Int, nthreads())
for i in 1:nthreads()
    A[i] = threadid()
end
A

# But if we prefix it with `@threads` then the loop body is divided between threads!

@threads for i in 1:nthreads()
    A[i] = threadid()
end
A

# So let's try implementing our first simple threaded algorithm — `sum`:

function threaded_sum1(A)
    r = zero(eltype(A))
    @threads for i in eachindex(A)
        @inbounds r += A[i]
    end
    return r
end

A = rand(10_000_000)
threaded_sum1(A)
@time threaded_sum1(A)

#-

sum(A)
@time sum(A)

# Whoa! What happened? Not only did we get the wrong answer, it was _slow_ to get it!

function threaded_sum2(A)
    r = Atomic{eltype(A)}(zero(eltype(A)))
    @threads for i in eachindex(A)
        @inbounds atomic_add!(r, A[i])
    end
    return r[]
end

threaded_sum2(A)
@time threaded_sum2(A)

# Alright! Now we got the correct answer (modulo some floating point associativity),
# but it's still slower than just doing the simple thing on 1 core.

threaded_sum2(A) ≈ sum(A)

# But it's still slow! Using atomics is much slower than just adding integers
# because we constantly have to go and check _which_ processor has the latest
# work! Also remember that each thread is running on its own processor — and
# that processor also supports SIMD! Well, that is if it didn't need to worry
# about syncing up with the other processors...

function threaded_sum3(A)
    r = Atomic{eltype(A)}(zero(eltype(A)))
    len, rem = divrem(length(A), nthreads())
    @threads for t in 1:nthreads()
        rₜ = zero(eltype(A))
        @simd for i in (1:len) .+ (t-1)*len
            @inbounds rₜ += A[i]
        end
        atomic_add!(r, rₜ)
    end
    # catch up any stragglers
    result = r[]
    @simd for i in length(A)-rem+1:length(A)
        @inbounds result += A[i]
    end
    return result
end

threaded_sum3(A)
@time threaded_sum3(A)

# Dang, that's complicated. There's also a problem: `Atomic`s only support
# primitive types like `Int` and `Float64`, so this fails outright on an
# array of complex numbers!

threaded_sum3(rand(10) .+ rand(10)im) # try an array of complex numbers!

# Isn't there an easier way?

R = zeros(eltype(A), nthreads())

#-

function threaded_sum4(A)
    R = zeros(eltype(A), nthreads())
    @threads for i in eachindex(A)
        @inbounds R[threadid()] += A[i]
    end
    r = zero(eltype(A))
    # sum the partial results from each thread
    for i in eachindex(R)
        @inbounds r += R[i]
    end
    return r
end

threaded_sum4(A)
@time threaded_sum4(A)

# This sacrifices our ability to `@simd` so it's a little slower, but at least we don't need to worry
# about all those indices! And we also don't need to worry about atomics and
# can again support arrays of any elements:

threaded_sum4(rand(10) .+ rand(10)im)

# ## Key takeaways from `threaded_sum`:
#
# * Beware shared state across threads — it may lead to wrong answers!
#     * Protect yourself by using atomics (or [locks/mutexes](https://docs.julialang.org/en/v1/base/multi-threading/#Synchronization-Primitives-1); see the sketch below)
#     * Better yet: divide up the work manually such that the inner loops don't
#       share state. `@threads for i in 1:nthreads()` is a handy idiom.
#     * Alternatively, just use an array and only access a single thread's elements
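# Since we didn't demonstrate the lock-based option above, here's a minimal
# sketch (the function name is illustrative): each thread accumulates into its
# own local variable, and a `ReentrantLock` guards the one shared update.

function threaded_sum_lock(A)
    r = Ref(zero(eltype(A)))
    lk = ReentrantLock()
    @threads for t in 1:nthreads()
        rₜ = zero(eltype(A))
        for i in t:nthreads():length(A) # a strided partition of the indices
            @inbounds rₜ += A[i]
        end
        lock(lk) do
            r[] += rₜ # only one thread at a time may run this
        end
    end
    return r[]
end

threaded_sum_lock(rand(10) .+ rand(10)im) # any element type works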

#-

# # Beware of global state (even if it's not obvious!)
#
# Another class of algorithm that you may want to parallelize is a monte-carlo
# problem. Since each iteration is a new random draw, and since you're interested
# in looking at the aggregate result, this seems like it should lend itself to
# parallelism quite nicely!

using BenchmarkTools

#-

function serialpi(n)
    inside = 0
    for i in 1:n
        x, y = rand(), rand()
        inside += (x^2 + y^2 <= 1)
    end
    return 4 * inside / n
end
serialpi(1)
@time serialpi(100_000_000)

# Let's use the techniques we learned above to make a fast threaded implementation:

function threadedpi(n)
    inside = zeros(Int, nthreads())
    @threads for i in 1:n
        x, y = rand(), rand()
        @inbounds inside[threadid()] += (x^2 + y^2 <= 1)
    end
    return 4 * sum(inside) / n
end
threadedpi(240)
@time threadedpi(100_000_000)

# Ok, now why didn't that work? It's slow! Let's look at the sequence of random
# numbers that we generate:

import Random
Random.seed!(0)
N = 20000
Rserial = zeros(N)
for i in 1:N
    Rserial[i] = rand()
end
Rserial

#-

Random.seed!(0)
Rthreaded = zeros(N)
@threads for i in 1:N
    Rthreaded[i] = rand()
end
Rthreaded

#-

Set(Rserial) == Set(Rthreaded)

# Aha, `rand()` isn't (currently) threadsafe! It's mutating (and reading) some global each
# time to figure out what to get next. This leads to slowdowns — and worse — it
# skews the generated distribution of random numbers since some are repeated!!
#
# Note: in the upcoming Julia 1.3 it will be threadsafe by default! Here's how
# we can emulate it on prior versions:

const ThreadRNG = Vector{Random.MersenneTwister}(undef, nthreads())
@threads for i in 1:nthreads()
    ThreadRNG[Threads.threadid()] = Random.MersenneTwister()
end
function threadedpi2(n)
    inside = zeros(Int, nthreads())
    len, rem = divrem(n, nthreads())
    rem == 0 || error("use a multiple of $(nthreads()), please!")
    @threads for i in 1:nthreads()
        rng = ThreadRNG[threadid()]
        v = 0
        for j in 1:len
            x, y = rand(rng), rand(rng)
            v += (x^2 + y^2 <= 1)
        end
        inside[threadid()] = v
    end
    return 4 * sum(inside) / n
end
threadedpi2(240)
@time threadedpi2(100_000_000)

# As an aside, be careful about initializing many `MersenneTwister`s with
# different states. Better to use [`randjump`](https://docs.julialang.org/en/v1/manual/parallel-computing/#Side-effects-and-mutable-function-arguments-1) to skip ahead for a single state.

#-

# # Beware oversubscription
#
# Remember how BLAS is threaded by default? What happens if we try to `@threads`
# something that uses BLAS?

Ms = [rand(1000, 1000) for _ in 1:100]
function serial_matmul(As)
    first_idxs = zeros(length(As))
    for i in eachindex(As)
        @inbounds first_idxs[i] = (As[i]'*As[i])[1]
    end
    first_idxs
end
serial_matmul(Ms[1:1]);
@time serial_matmul(Ms);

#-

using LinearAlgebra
BLAS.set_num_threads(nthreads()) # Explicitly tell BLAS to use the same number of threads
function threaded_matmul(As)
    first_idxs = zeros(length(As))
    @threads for i in eachindex(As)
        @inbounds first_idxs[i] = (As[i]'*As[i])[1]
    end
    first_idxs
end
threaded_matmul(Ms[1:1])
@time threaded_matmul(Ms);

#-

BLAS.set_num_threads(1)
@time threaded_matmul(Ms);

#-

@time serial_matmul(Ms) # Again, now that BLAS has just 1 thread

# # Beware "false sharing"

#-

# Remember the memory latency table?
#
#
# | System Event                   | Actual Latency | Scaled Latency |                          |
# | ------------------------------ | -------------- | -------------- | ------------------------ |
# | One CPU cycle                  | 0.4 ns         | 1 s            | ← work happens here      |
# | Level 1 cache access           | 0.9 ns         | 2 s            |                          |
# | Level 2 cache access           | 2.8 ns         | 7 s            |                          |
# | Level 3 cache access           | 28 ns          | 1 min          |                          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          | ← we have control here   |
#
# This is what a typical modern cpu looks like:
#
# ![Intel Core i7](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/i7.jpg)
#
# Multiple cores on the same processor share the L3 cache, but do not share L1 and L2 caches! So what happens if we're accessing and mutating data from the same array across multiple cores?
#
# ![Cache coherency](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/false-sharing.gif)
#
# Unlike "true" sharing — which we saw above — false sharing will still return the correct answer! But it does so at the cost of performance.
# The cores recognize they don't have exclusive access to the cache line and so upon modification they alert all other cores to invalidate and re-fetch the data.

function test(spacing)
    a = zeros(Threads.nthreads()*spacing)
    b = rand(1000000)
    calls = zeros(Threads.nthreads()*spacing)
    @threads for i in eachindex(b)
        a[Threads.threadid()*spacing] += b[i]
        calls[Threads.threadid()*spacing] += 1
    end
    a, calls
end
@benchmark test(1);
@benchmark test(8);

#-

# ## Further improvements coming here!
#
# PARTR — the threading improvement I discussed at the beginning — aims to address
# this problem of having library functions implemented with `@threads` and then
# having callers call them with `@threads`. It uses a state-of-the-art work queue
# mechanism to make sure that all threads stay busy.

#-

# # Threading takeaways:
#
# * It's easy! Just start Julia with `JULIA_NUM_THREADS` and tack a `@threads` on your loop
# * Well, not so fast
#     * Be aware of your hardware to set `JULIA_NUM_THREADS` appropriately
#     * Beware shared state (for both performance and correctness)
#     * Beware global state (but the built-in global state is improving!)
#     * Beware false sharing (especially with multiple processor chips)
# * We need to think carefully about how to design parallel algorithms!
--------------------------------------------------------------------------------
/070 Distributed.jl:
--------------------------------------------------------------------------------
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()

# # Distributed (or multi-core or multi-process) parallelism
#
# Julia has a built-in standard library — Distributed — that allows you to
# start and run multiple concurrent Julia processes. Imagine starting a slew
# of Julia instances and then having an easy way to run code on each and every
# one of them; that's what Distributed provides.
#
# ![](https://raw.githubusercontent.com/JuliaComputing/JuliaAcademyData.jl/master/courses/Parallel_Computing/images/Julia6x.png)

using Distributed
nprocs()

import Hwloc
n = Hwloc.num_physical_cores()

#-

addprocs(n, exeflags=`--project=$@__DIR__`)
nprocs()

#-

myid()

# Now we can easily communicate with the other nodes:

r = @spawnat 2 (myid(), rand())

#-

fetch(r)

# This works kinda like an `@async` task!

@time r = @spawnat 2 (sleep(1), rand())
@time fetch(r)

# So we can repeat the same examples from tasks:

@time for w in workers()
    @spawnat w sleep(1)
end

#-

@time @sync for w in workers()
    @spawnat w sleep(1)
end

# Except unlike tasks, we're executing the code on a separate process — which
# can be performed on a different processor in parallel!

@everywhere function work(N)
    series = 1.0
    for i in 1:N
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end

#-

@time work(1_000_000_000)
@time @sync for i in workers()
    @spawnat i work(1_000_000_000)
end

# Of course, this isn't very helpful. We're just performing exactly the same
# calculation on every worker... and then completely ignoring the result!
# Let's
# restructure our computation to be a bit more parallel friendly:

@everywhere function partial_pi(r)
    series = 0.0
    for i in r
        series += (isodd(i) ? -1 : 1) / (i*2+1)
    end
    return 4*series
end
a = partial_pi(0:999)
a, a-pi

#-

b = partial_pi(1000:9999)
(a + b), (a+b) - pi

# So now we can distribute this computation across our many workers!

r = 0:1_000_000_000
futures = Array{Future}(undef, nworkers())
@time begin
    for (i, id) in enumerate(workers())
        batch = 0:length(r)÷nworkers()-1
        futures[i] = @spawnat id partial_pi(batch .+ (i-1)*(length(r)÷nworkers()))
    end
    p = sum(fetch.(futures))
end
p - pi

# But that's rather annoying — needing to carefully divide up our workflow and
# manually collect all our results and such. There's an easier way:

@time p = @distributed (+) for r in [(0:9999) .+ offset for offset in 0:10000:r[end]-1]
    partial_pi(r)
end
p - pi

# Why is this different from `@threads for` and `@simd for`? Why not just
# `@distributed for`? Why the `@distributed (+) for`?

#-

# ## Data movement

#-

# Remember: Moving data is _expensive_!
#
# | System Event                   | Actual Latency | Scaled Latency |
# | ------------------------------ | -------------- | -------------- |
# | One CPU cycle                  | 0.4 ns         | 1 s            |
# | Level 1 cache access           | 0.9 ns         | 2 s            |
# | Level 2 cache access           | 2.8 ns         | 7 s            |
# | Level 3 cache access           | 28 ns          | 1 min          |
# | Main memory access (DDR DIMM)  | ~100 ns        | 4 min          |
# | Intel Optane memory access     | <10 μs         | 7 hrs          |
# | NVMe SSD I/O                   | ~25 μs         | 17 hrs         |
# | SSD I/O                        | 50–150 μs      | 1.5–4 days     |
# | Rotational disk I/O            | 1–10 ms        | 1–9 months     |
# | Internet call: SF to NYC       | 65 ms          | 5 years        |
# | Internet call: SF to Hong Kong | 141 ms         | 11 years       |
#
# You really don't want to be taking a trip to the moon very frequently.
# Communication between processes can indeed be as expensive as hitting a disk —
# sometimes they're even implemented that way.
#
# So that's why Julia has special support for reductions built in to the
# `@distributed` macro: each worker can do its own (intermediate) reduction
# before returning just one value to our master node.

#-

# But sometimes you need to see those intermediate values. If you have a
# very expensive computation relative to the communication overhead, there are
# several ways to do this. The easiest is `pmap`:

@time pmap(partial_pi, [(0:99999) .+ offset for offset in 0:100000:r[end]-1])

# When we have a large computation relative to the number of return values,
# `pmap` is great and easy.
#
# Increase the work on each worker by 100x and reduce the amount of communication by 100x:

@time pmap(partial_pi, [(0:9999999) .+ offset for offset in 0:10000000:r[end]-1])

# There are other ways of doing this, though, too — we'll get to them in a minute.
# But first, there's something else that I glossed over: the `@everywhere`s above.

#-

# ## Code movement

#-

# Each node is _completely_ independent; it's like starting brand new, separate
# Julia processes yourself. By default, `addprocs()` just launches the
# appropriate number of workers for the current workstation that you're on, but
# you can easily connect them to remote machines via SSH or even through cluster
# managers.
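# For instance, a sketch of what launching workers over SSH looks like (the
# hostnames here are placeholders; it requires passwordless SSH and Julia
# installed on each machine):
#
#     addprocs([("user@host1", 2), ("user@host2", 2)]) # two workers per host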
# By default, `addprocs()` just launches the appropriate number of workers for
# the machine that you're on, but you can easily connect them to remote
# machines via SSH or even through cluster managers.

#-

# Those `@everywhere`s above are very important! They run the given expression
# on all workers to make sure the state between them is consistent. Without
# them, you'll see errors like this:

hello() = "hello world"
r = @spawnat 2 hello()

fetch(r)

# Note that this applies to packages, too!

using Statistics # The Statistics stdlib defines mean
fetch(@spawnat 2 mean(rand(100_000)))

#-

@everywhere using Statistics
fetch(@spawnat 2 mean(rand(100_000)))

# # Other ways to structure and/or share data between processes
#
# Unlike `@threads`, we no longer have access to the same memory. While this
# does make expressing some algorithms a little more tricky, the "default"
# is much safer! There isn't any shared state to begin with, so it's harder
# to write an incorrect algorithm. It's also just harder to write some
# algorithms in the first place.
#
# So there are some special array types that can help bridge the gap between
# processes and make writing parallel code a bit easier.

#-

# ## The `SharedArray`
#
# If all workers are on the same physical machine, then — while they cannot
# directly see each other's memory — they can ask the operating system to map
# the very same region of memory into each of their address spaces.
#
# The `SharedArray` makes use of this fact, allowing concurrent access to the
# same array — somewhat akin to the threads' default state.
#
# This is the prefix definition from the "thinking in parallel" course:
#
# ```
# using .Threads
# function prefix_threads!(y, ⊕)
#     l=length(y)
#     k=ceil(Int, log2(l))
#     for j=1:k
#         @threads for i=2^j:2^j:min(l, 2^k) #"reduce"
#             @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
#         end
#     end
#     for j=(k-1):-1:1
#         @threads for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
#             @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
#         end
#     end
#     y
# end
# ```

using SharedArrays
function prefix!(⊕, y::SharedArray)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @distributed for i=2^j:2^j:min(l, 2^k) #"reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @distributed for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end
data = rand(1_000_000);
A = SharedArray(data);

#-

prefix!(+, copy(A)) # compile
@time prefix!(+, A);

#-

A ≈ cumsum(data)

# What went wrong?
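# Without a reducer, `@distributed` only _launches_ its loop — it returns
# immediately, without waiting for the workers to finish! So each phase of the
# algorithm started racing ahead before the previous phase was done. A quick
# way to see that behavior:

@time @distributed for i in 1:nworkers()
    sleep(1)
end # "finishes" almost instantly — the work is still running in the background

# Just like with the bare `@spawnat` loops earlier, the fix is to wait with `@sync`:

#-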
function prefix!(⊕, y::SharedArray)
    l=length(y)
    k=ceil(Int, log2(l))
    for j=1:k
        @sync @distributed for i=2^j:2^j:min(l, 2^k) #"reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @sync @distributed for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    y
end
A = SharedArray(data)
@time prefix!(+, A)

#-

A ≈ cumsum(data)

# ## DistributedArrays
#
# We can, though, turn the problem on its head and allow the _data_ itself
# to determine how the problem gets split up. This can save us tons of indexing
# headaches.

@everywhere using Distributed
using DistributedArrays
@everywhere using DistributedArrays
A = DArray(I->fill(myid(), length.(I)), (24, 24))

# The first argument is a function that maps a given set of indices `I` to the
# _local portion_ of the distributed array covering those indices.

A = DArray((24,24)) do I
    @show I
    fill(myid(), length.(I))
end

# Notice that none of the array actually lives on processor 1, but we can still
# display the contents — when we do, we're requesting that all the workers give
# us their current data! And while we've only talked about master-worker
# communication so far, workers can communicate directly amongst themselves,
# too (by default).

using BenchmarkTools
@everywhere using BenchmarkTools
fetch(@spawnat 2 @benchmark $A[1,1])

#-

fetch(@spawnat 2 @benchmark $A[end,end])

# So it's fastest to work on a `DArray`'s "local" portion, but it's _possible_
# to grab other data if need be. This is perfect for any sort of tiled operation
# that works on neighboring values (like image filtering/convolution). Or Conway's
# game of life!

function life_step(d::DArray)
    DArray(size(d),procs(d)) do I
        # Compute the indices of the outside edge (that will come from other processors)
        top   = mod1(first(I[1])-1,size(d,1))
        bot   = mod1( last(I[1])+1,size(d,1))
        left  = mod1(first(I[2])-1,size(d,2))
        right = mod1( last(I[2])+1,size(d,2))
        # Create a new, temporary array that holds the local part + outside edge
        old = Array{Bool}(undef, length(I[1])+2, length(I[2])+2)
        # These accesses will pull data from other processors
        old[1      , 1      ] = d[top , left]
        old[2:end-1, 1      ] = d[I[1], left]  # left side (and corners)
        old[end    , 1      ] = d[bot , left]
        old[1      , end    ] = d[top , right]
        old[2:end-1, end    ] = d[I[1], right] # right side (and corners)
        old[end    , end    ] = d[bot , right]
        old[1      , 2:end-1] = d[top , I[2]]  # top
        old[end    , 2:end-1] = d[bot , I[2]]  # bottom
        # But this big one is all local!
        old[2:end-1, 2:end-1] = d[I[1], I[2]]  # middle

        life_rule(old) # Compute the new segment!
    end
end
@everywhere function life_rule(old)
    # Now this part — the computational part — is entirely local and on Arrays!
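    # `old` carries a one-cell halo of neighbors around the local block, so the
    # result is two smaller in each dimension; a cell lives on with exactly 3
    # live neighbors, or with 2 if it was already alive.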
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        @inbounds for i = 2:m-1
            nc = (+)(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                     old[i  ,j-1],             old[i  ,j+1],
                     old[i+1,j-1], old[i+1,j], old[i+1,j+1])
            new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j])
        end
    end
    new
end

#-

A = DArray(I->rand(Bool, length.(I)), (20,20))
@everywhere using Colors
Gray.(A)

#-

B = copy(A)

#-

B = life_step(B) # re-run this cell to step the simulation forward again
Gray.(B)

#-

# ## Clusters and more ways to distribute
#
# You can easily connect to completely separate machines with SSH access built in!
# But there are many other ways to connect to clusters:
#
# * [JuliaRun](https://juliacomputing.com/products/juliarun)
# * [Kubernetes](https://juliacomputing.com/blog/2018/12/15/kuber.html)
# * [MPI](https://github.com/JuliaParallel/MPI.jl)
# * [Cluster job queues with ClusterManagers](https://github.com/JuliaParallel/ClusterManagers.jl)
# * [Hadoop](https://github.com/JuliaParallel/Elly.jl)
# * [Spark](https://github.com/dfdx/Spark.jl)

#-

# # Multi-process parallelism is the heavy-duty workhorse in Julia
#
# It can tackle very large problems and distribute across a very large number
# of workers. Key things to remember:
#
# * Each worker is a completely independent Julia process
#   * Data must move to them
#   * Code must move to them
# * Structure your algorithms and use a distributed mechanism that fits with the
#   time and memory parameters of your problem
#   * `@distributed` can be good for reductions and even relatively fast inner
#     loops with limited (or no) explicit data transfer
#   * `pmap` is great for very expensive inner loops that return a value (the
#     closing sketch below contrasts the two)
#   * `SharedArray`s can be an easier drop-in replacement for threading-like
#     behaviors (on a single machine)
#   * `DistributedArray`s can turn the problem on its head and let the data do
#     the work splitting!
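
#-

# As that closing sketch, here are the first two mechanisms side by side on
# the same chunks, re-using `partial_pi` and the workers from above.
# `@distributed (+)` reduces worker-side and ships a single number back from
# each worker, while `pmap` ships one result back per chunk.

chunks = [(0:9_999_999) .+ offset for offset in 0:10_000_000:999_999_999]
p1 = @distributed (+) for c in chunks
    partial_pi(c)
end
p2 = sum(pmap(partial_pi, chunks))
p1 - pi, p2 - pi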
408 | -------------------------------------------------------------------------------- /Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[AbstractFFTs]] 4 | deps = ["LinearAlgebra"] 5 | git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" 6 | uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" 7 | version = "0.4.1" 8 | 9 | [[AbstractTrees]] 10 | deps = ["Markdown", "Test"] 11 | git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" 12 | uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" 13 | version = "0.2.1" 14 | 15 | [[Adapt]] 16 | deps = ["LinearAlgebra", "Test"] 17 | git-tree-sha1 = "53d8fec4f662088c1202530e338a11a919407f3b" 18 | uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 19 | version = "0.4.2" 20 | 21 | [[Arpack]] 22 | deps = ["BinaryProvider", "Libdl", "LinearAlgebra"] 23 | git-tree-sha1 = "07a2c077bdd4b6d23a40342a8a108e2ee5e58ab6" 24 | uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" 25 | version = "0.3.1" 26 | 27 | [[AxisAlgorithms]] 28 | deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"] 29 | git-tree-sha1 = "a4d07a1c313392a77042855df46c5f534076fab9" 30 | uuid = "13072b0f-2c55-5437-9ae7-d433b7a33950" 31 | version = "1.0.0" 32 | 33 | [[Base64]] 34 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 35 | 36 | [[BenchmarkTools]] 37 | deps = ["JSON", "Printf", "Statistics", "Test"] 38 | git-tree-sha1 = "5d1dd8577643ba9014574cd40d9c028cd5e4b85a" 39 | uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 40 | version = "0.4.2" 41 | 42 | [[BinDeps]] 43 | deps = ["Compat", "Libdl", "SHA", "URIParser"] 44 | git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" 45 | uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" 46 | version = "0.8.10" 47 | 48 | [[BinaryProvider]] 49 | deps = ["Compat", "CredentialsHandler", "Libdl", "Pkg", "SHA", "TOML", "Test"] 50 | uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" 51 | 52 | [[CSTParser]] 53 | deps = ["Tokenize"] 54 | git-tree-sha1 = "376a39f1862000442011390f1edf5e7f4dcc7142" 55 | uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" 56 | version = "0.6.0" 57 | 58 | [[CUDAapi]] 59 | deps = ["Libdl", "Logging", "Test"] 60 | git-tree-sha1 = "125122309a4387e0d18787cef0f03800fa57702a" 61 | uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 62 | version = "0.6.3" 63 | 64 | [[CUDAdrv]] 65 | deps = ["CUDAapi", "Libdl", "Printf"] 66 | git-tree-sha1 = "9b2d99981b984378799ec70dd005cb7e7b4e914c" 67 | uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 68 | version = "3.0.1" 69 | 70 | [[CUDAnative]] 71 | deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] 72 | git-tree-sha1 = "36cbb94f74cd3e5db774134a68dc5d033ae2c87e" 73 | uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 74 | version = "2.2.1" 75 | 76 | [[Calculus]] 77 | deps = ["Compat"] 78 | git-tree-sha1 = "f60954495a7afcee4136f78d1d60350abd37a409" 79 | uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" 80 | version = "0.4.1" 81 | 82 | [[CategoricalArrays]] 83 | deps = ["Compat", "Future", "JSON", "Missings", "Printf", "Reexport"] 84 | git-tree-sha1 = "26601961df6afacdd16d67c1eec6cfe75e5ae9ab" 85 | uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" 86 | version = "0.5.4" 87 | 88 | [[CodecZlib]] 89 | deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] 90 | git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b" 91 | uuid = "944b1d66-785c-5afd-91f1-9de20f533193" 92 | version = "0.5.2" 93 | 94 | [[ColorTypes]] 95 | deps 
= ["FixedPointNumbers", "Random"] 96 | git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" 97 | uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" 98 | version = "0.8.0" 99 | 100 | [[Colors]] 101 | deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] 102 | git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" 103 | uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" 104 | version = "0.9.5" 105 | 106 | [[CommonSubexpressions]] 107 | deps = ["Test"] 108 | git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" 109 | uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" 110 | version = "0.2.0" 111 | 112 | [[Compat]] 113 | deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] 114 | git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" 115 | uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" 116 | version = "2.1.0" 117 | 118 | [[Compose]] 119 | deps = ["Base64", "Colors", "DataStructures", "Dates", "IterTools", "JSON", "LinearAlgebra", "Measures", "Printf", "Random", "Requires", "Test", "UUIDs"] 120 | git-tree-sha1 = "7d8fe0ad6f73c40ccc4e01f426a700c5a843a1d3" 121 | uuid = "a81c6b42-2e10-5240-aca2-a61377ecd94b" 122 | version = "0.7.3" 123 | 124 | [[Conda]] 125 | deps = ["JSON", "VersionParsing"] 126 | git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" 127 | uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" 128 | version = "1.3.0" 129 | 130 | [[Contour]] 131 | deps = ["LinearAlgebra", "StaticArrays", "Test"] 132 | git-tree-sha1 = "b974e164358fea753ef853ce7bad97afec15bb80" 133 | uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" 134 | version = "0.5.1" 135 | 136 | [[CoupledFields]] 137 | deps = ["Compat", "StatsBase"] 138 | git-tree-sha1 = "d56f26542bb7af9c0ec16e098a0a33352f3c9d8e" 139 | uuid = "7ad07ef1-bdf2-5661-9d2b-286fd4296dac" 140 | version = "0.1.0" 141 | 142 | [[Crayons]] 143 | deps = ["Test"] 144 | git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" 145 | uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" 146 | version = "4.0.0" 147 | 148 | [[CredentialsHandler]] 149 | deps = ["Base64", "HTTP", "TOML"] 150 | uuid = "864e158e-919d-11e8-198e-cfe890ec4681" 151 | 152 | [[CuArrays]] 153 | deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] 154 | git-tree-sha1 = "f95cbe4fe78b1fff00691aa1d2e533542f095358" 155 | uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 156 | version = "1.0.2" 157 | 158 | [[DataStructures]] 159 | deps = ["InteractiveUtils", "OrderedCollections"] 160 | git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" 161 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 162 | version = "0.17.0" 163 | 164 | [[Dates]] 165 | deps = ["Printf"] 166 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" 167 | 168 | [[DelimitedFiles]] 169 | deps = ["Mmap"] 170 | uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" 171 | 172 | [[DiffEqDiffTools]] 173 | deps = ["LinearAlgebra", "SparseArrays", "StaticArrays"] 174 | git-tree-sha1 = "b992345a39b4d9681342ae795a8dacc100730182" 175 | uuid = "01453d9d-ee7c-5054-8395-0335cb756afa" 176 | version = "0.14.0" 177 | 178 | [[DiffResults]] 179 | deps = ["Compat", "StaticArrays"] 180 | git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" 181 | uuid = 
"163ba53b-c6d8-5494-b064-1a9d43ac40c5" 182 | version = "0.0.4" 183 | 184 | [[DiffRules]] 185 | deps = ["Random", "Test"] 186 | git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" 187 | uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" 188 | version = "0.0.10" 189 | 190 | [[Distances]] 191 | deps = ["LinearAlgebra", "Printf", "Random", "Statistics", "Test"] 192 | git-tree-sha1 = "a135c7c062023051953141da8437ed74f89d767a" 193 | uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" 194 | version = "0.8.0" 195 | 196 | [[Distributed]] 197 | deps = ["Random", "Serialization", "Sockets"] 198 | uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" 199 | 200 | [[DistributedArrays]] 201 | deps = ["Distributed", "LinearAlgebra", "Primes", "Random", "Serialization", "SparseArrays", "Statistics"] 202 | git-tree-sha1 = "9b4689b8d49b42351d518431ff642ed29cedd6d4" 203 | uuid = "aaf54ef3-cdf8-58ed-94cc-d582ad619b94" 204 | version = "0.6.2" 205 | 206 | [[Distributions]] 207 | deps = ["LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] 208 | git-tree-sha1 = "56a158bc0abe4af5d4027af2275fde484261ca6d" 209 | uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" 210 | version = "0.19.2" 211 | 212 | [[DocStringExtensions]] 213 | deps = ["LibGit2", "Markdown", "Pkg", "Test"] 214 | git-tree-sha1 = "0513f1a8991e9d83255e0140aace0d0fc4486600" 215 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" 216 | version = "0.8.0" 217 | 218 | [[FFTW]] 219 | deps = ["AbstractFFTs", "BinaryProvider", "Compat", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] 220 | git-tree-sha1 = "29cda58afbf62f35b1a094882ad6c745a47b2eaa" 221 | uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 222 | version = "0.2.4" 223 | 224 | [[FileWatching]] 225 | uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" 226 | 227 | [[FillArrays]] 228 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Test"] 229 | git-tree-sha1 = "9ab8f76758cbabba8d7f103c51dce7f73fcf8e92" 230 | uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" 231 | version = "0.6.3" 232 | 233 | [[FixedPointNumbers]] 234 | git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" 235 | uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" 236 | version = "0.6.1" 237 | 238 | [[Flux]] 239 | deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "DelimitedFiles", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SHA", "Statistics", "StatsBase", "Tracker", "ZipFile"] 240 | git-tree-sha1 = "08212989c2856f95f90709ea5fd824bd27b34514" 241 | uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" 242 | version = "0.8.3" 243 | 244 | [[ForwardDiff]] 245 | deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] 246 | git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" 247 | uuid = "f6369f11-7733-5829-9624-2563aa707210" 248 | version = "0.10.3" 249 | 250 | [[Future]] 251 | deps = ["Random"] 252 | uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" 253 | 254 | [[GPUArrays]] 255 | deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] 256 | git-tree-sha1 = "6b556af6e42b71f5712a98f8df3d110a76bfdea9" 257 | uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 258 | version = "0.7.2" 259 | 260 | [[Gadfly]] 261 | deps = ["Base64", "CategoricalArrays", "Colors", "Compat", "Compose", "Contour", "CoupledFields", "DataStructures", "Dates", "Distributions", "DocStringExtensions", 
"Hexagons", "IndirectArrays", "IterTools", "JSON", "Juno", "KernelDensity", "LibGit2", "LinearAlgebra", "Loess", "Measures", "Printf", "Random", "Requires", "Showoff", "Statistics", "StatsBase", "Test"] 262 | git-tree-sha1 = "ee709588c71eb62ce53cecf5a594bdefd6f2a9be" 263 | uuid = "c91e804a-d5a3-530f-b6f0-dfbca275c004" 264 | version = "1.0.1" 265 | 266 | [[HTTP]] 267 | deps = ["Base64", "Dates", "Distributed", "IniFile", "JSON", "MbedTLS", "Sockets", "Test"] 268 | uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" 269 | 270 | [[Hexagons]] 271 | deps = ["Test"] 272 | git-tree-sha1 = "de4a6f9e7c4710ced6838ca906f81905f7385fd6" 273 | uuid = "a1b4810d-1bce-5fbd-ac56-80944d57a21f" 274 | version = "0.2.0" 275 | 276 | [[Hwloc]] 277 | deps = ["BinaryProvider", "Libdl"] 278 | git-tree-sha1 = "bb23d264d76b82d1da80733cbb01bad8a11ae489" 279 | uuid = "0e44f5e4-bd66-52a0-8798-143a42290a1d" 280 | version = "1.0.3" 281 | 282 | [[IndirectArrays]] 283 | deps = ["Compat", "Test"] 284 | git-tree-sha1 = "b6e249be10a3381b2c72ac82f2d13d70067cb2bd" 285 | uuid = "9b13fd28-a010-5f03-acff-a1bbcff69959" 286 | version = "0.5.0" 287 | 288 | [[IniFile]] 289 | uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" 290 | 291 | [[InteractiveUtils]] 292 | deps = ["Markdown"] 293 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 294 | 295 | [[Interpolations]] 296 | deps = ["AxisAlgorithms", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] 297 | git-tree-sha1 = "e1bac96b5ef3ea23b50e801b4a988ec21861a47f" 298 | uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" 299 | version = "0.12.2" 300 | 301 | [[IterTools]] 302 | git-tree-sha1 = "2ebe60d7343962966d1779a74a760f13217a6901" 303 | uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" 304 | version = "1.2.0" 305 | 306 | [[JSON]] 307 | deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"] 308 | git-tree-sha1 = "1f7a25b53ec67f5e9422f1f551ee216503f4a0fa" 309 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 310 | version = "0.20.0" 311 | 312 | [[Juno]] 313 | deps = ["Base64", "Logging", "Media", "Profile", "Test"] 314 | git-tree-sha1 = "4e4a8d43aa7ecec66cadaf311fbd1e5c9d7b9175" 315 | uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" 316 | version = "0.7.0" 317 | 318 | [[KernelDensity]] 319 | deps = ["Distributions", "FFTW", "Interpolations", "Optim", "StatsBase", "Test"] 320 | git-tree-sha1 = "c1048817fe5711f699abc8fabd47b1ac6ba4db04" 321 | uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" 322 | version = "0.5.1" 323 | 324 | [[LLVM]] 325 | deps = ["Libdl", "Printf", "Unicode"] 326 | git-tree-sha1 = "7fafc370730b515a6273046a53cbb548ef3e38f7" 327 | uuid = "929cbde3-209d-540e-8aea-75f648917ca0" 328 | version = "1.1.1" 329 | 330 | [[LibGit2]] 331 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" 332 | 333 | [[Libdl]] 334 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 335 | 336 | [[LineSearches]] 337 | deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf", "Test"] 338 | git-tree-sha1 = "54eb90e8dbe745d617c78dee1d6ae95c7f6f5779" 339 | uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" 340 | version = "7.0.1" 341 | 342 | [[LinearAlgebra]] 343 | deps = ["Libdl"] 344 | uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 345 | 346 | [[Loess]] 347 | deps = ["Distances", "Random", "Statistics", "Test"] 348 | git-tree-sha1 = "0ee46caf683a422b595be4dfaed6cda28f541e25" 349 | uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612" 350 | version = "0.5.0" 351 | 352 | [[Logging]] 353 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 354 | 355 | 
[[MacroTools]] 356 | deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] 357 | git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" 358 | uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" 359 | version = "0.5.1" 360 | 361 | [[Markdown]] 362 | deps = ["Base64"] 363 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 364 | 365 | [[MbedTLS]] 366 | deps = ["BinaryProvider", "Dates", "Libdl", "Pkg", "Random", "Sockets"] 367 | uuid = "739be429-bea8-5141-9913-cc70e7f3736d" 368 | 369 | [[Measures]] 370 | deps = ["Test"] 371 | git-tree-sha1 = "ddfd6d13e330beacdde2c80de27c1c671945e7d9" 372 | uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" 373 | version = "0.3.0" 374 | 375 | [[Media]] 376 | deps = ["MacroTools", "Test"] 377 | git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" 378 | uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" 379 | version = "0.5.0" 380 | 381 | [[Missings]] 382 | deps = ["SparseArrays", "Test"] 383 | git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007" 384 | uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" 385 | version = "0.4.1" 386 | 387 | [[Mmap]] 388 | uuid = "a63ad114-7e13-5084-954f-fe012c677804" 389 | 390 | [[NLSolversBase]] 391 | deps = ["Calculus", "DiffEqDiffTools", "DiffResults", "Distributed", "ForwardDiff", "LinearAlgebra", "Random", "SparseArrays", "Test"] 392 | git-tree-sha1 = "0c6f0e7f2178f78239cfb75310359eed10f2cacb" 393 | uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" 394 | version = "7.3.1" 395 | 396 | [[NNlib]] 397 | deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] 398 | git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" 399 | uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 400 | version = "0.6.0" 401 | 402 | [[NaNMath]] 403 | deps = ["Compat"] 404 | git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" 405 | uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" 406 | version = "0.3.2" 407 | 408 | [[OffsetArrays]] 409 | git-tree-sha1 = "1af2f79c7eaac3e019a0de41ef63335ff26a0a57" 410 | uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" 411 | version = "0.11.1" 412 | 413 | [[Optim]] 414 | deps = ["Calculus", "DiffEqDiffTools", "ForwardDiff", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "Random", "SparseArrays", "StatsBase", "Test"] 415 | git-tree-sha1 = "a626e09c1f7f019b8f3a30a8172c7b82d2f4810b" 416 | uuid = "429524aa-4258-5aef-a3af-852621145aeb" 417 | version = "0.18.1" 418 | 419 | [[OrderedCollections]] 420 | deps = ["Random", "Serialization", "Test"] 421 | git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" 422 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 423 | version = "1.1.0" 424 | 425 | [[PDMats]] 426 | deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] 427 | git-tree-sha1 = "8b68513175b2dc4023a564cb0e917ce90e74fd69" 428 | uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" 429 | version = "0.9.7" 430 | 431 | [[Parameters]] 432 | deps = ["Markdown", "OrderedCollections", "REPL", "Test"] 433 | git-tree-sha1 = "70bdbfb2bceabb15345c0b54be4544813b3444e4" 434 | uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" 435 | version = "0.10.3" 436 | 437 | [[Pkg]] 438 | deps = ["BinaryProvider", "CredentialsHandler", "Dates", "HTTP", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "TOML", "UUIDs"] 439 | uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 440 | 441 | [[PositiveFactorizations]] 442 | deps = ["LinearAlgebra", "Test"] 443 | git-tree-sha1 = "957c3dd7c33895469760ce873082fbb6b3620641" 444 | uuid = 
"85a6dd25-e78a-55b7-8502-1745935b8125" 445 | version = "0.2.2" 446 | 447 | [[Primes]] 448 | deps = ["Test"] 449 | git-tree-sha1 = "ff1a2323cb468ec5f201838fcbe3c232266b1f95" 450 | uuid = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" 451 | version = "0.4.0" 452 | 453 | [[Printf]] 454 | deps = ["Unicode"] 455 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 456 | 457 | [[Profile]] 458 | deps = ["Printf"] 459 | uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" 460 | 461 | [[PyCall]] 462 | deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Serialization", "Statistics", "Test", "VersionParsing"] 463 | git-tree-sha1 = "6e5bac1b1faf3575731a6a5b76f638f2389561d3" 464 | uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" 465 | version = "1.91.2" 466 | 467 | [[QuadGK]] 468 | deps = ["DataStructures", "LinearAlgebra", "Test"] 469 | git-tree-sha1 = "3ce467a8e76c6030d4c3786e7d3a73442017cdc0" 470 | uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" 471 | version = "2.0.3" 472 | 473 | [[REPL]] 474 | deps = ["InteractiveUtils", "Markdown", "Sockets"] 475 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 476 | 477 | [[Random]] 478 | deps = ["Serialization"] 479 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 480 | 481 | [[Ratios]] 482 | deps = ["Compat"] 483 | git-tree-sha1 = "cdbbe0f350581296f3a2e3e7a91b214121934407" 484 | uuid = "c84ed2f1-dad5-54f0-aa8e-dbefe2724439" 485 | version = "0.3.1" 486 | 487 | [[Reexport]] 488 | deps = ["Pkg"] 489 | git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" 490 | uuid = "189a3867-3050-52da-a836-e630ba90ab69" 491 | version = "0.2.0" 492 | 493 | [[Requires]] 494 | deps = ["Test"] 495 | git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" 496 | uuid = "ae029012-a4dd-5104-9daa-d747884805df" 497 | version = "0.5.2" 498 | 499 | [[Rmath]] 500 | deps = ["BinaryProvider", "Libdl", "Random", "Statistics", "Test"] 501 | git-tree-sha1 = "9a6c758cdf73036c3239b0afbea790def1dabff9" 502 | uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" 503 | version = "0.5.0" 504 | 505 | [[SHA]] 506 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 507 | 508 | [[Serialization]] 509 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 510 | 511 | [[SharedArrays]] 512 | deps = ["Distributed", "Mmap", "Random", "Serialization"] 513 | uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" 514 | 515 | [[Showoff]] 516 | deps = ["Dates"] 517 | git-tree-sha1 = "e032c9df551fb23c9f98ae1064de074111b7bc39" 518 | uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" 519 | version = "0.3.1" 520 | 521 | [[Sockets]] 522 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc" 523 | 524 | [[SortingAlgorithms]] 525 | deps = ["DataStructures", "Random", "Test"] 526 | git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" 527 | uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" 528 | version = "0.3.1" 529 | 530 | [[SparseArrays]] 531 | deps = ["LinearAlgebra", "Random"] 532 | uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 533 | 534 | [[SpecialFunctions]] 535 | deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] 536 | git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" 537 | uuid = "276daf66-3868-5448-9aa4-cd146d93841b" 538 | version = "0.7.2" 539 | 540 | [[StaticArrays]] 541 | deps = ["LinearAlgebra", "Random", "Statistics"] 542 | git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" 543 | uuid = "90137ffa-7385-5640-81b9-e52037218182" 544 | version = "0.11.0" 545 | 546 | [[Statistics]] 547 | deps = ["LinearAlgebra", "SparseArrays"] 548 | uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 549 | 550 | [[StatsBase]] 551 | deps = 
["DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] 552 | git-tree-sha1 = "2b6ca97be7ddfad5d9f16a13fe277d29f3d11c23" 553 | uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" 554 | version = "0.31.0" 555 | 556 | [[StatsFuns]] 557 | deps = ["Rmath", "SpecialFunctions", "Test"] 558 | git-tree-sha1 = "b3a4e86aa13c732b8a8c0ba0c3d3264f55e6bb3e" 559 | uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" 560 | version = "0.8.0" 561 | 562 | [[SuiteSparse]] 563 | deps = ["Libdl", "LinearAlgebra", "SparseArrays"] 564 | uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" 565 | 566 | [[TOML]] 567 | deps = ["Dates"] 568 | uuid = "9d418dce-91a8-11e8-0173-7b01a971d501" 569 | 570 | [[Test]] 571 | deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] 572 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 573 | 574 | [[TimerOutputs]] 575 | deps = ["Crayons", "Printf", "Test", "Unicode"] 576 | git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" 577 | uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 578 | version = "0.5.0" 579 | 580 | [[Tokenize]] 581 | git-tree-sha1 = "0de343efc07da00cd449d5b04e959ebaeeb3305d" 582 | uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" 583 | version = "0.5.4" 584 | 585 | [[Tracker]] 586 | deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"] 587 | git-tree-sha1 = "327342fec6e09f68ced0c2dc5731ed475e4b696b" 588 | uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" 589 | version = "0.2.2" 590 | 591 | [[TranscodingStreams]] 592 | deps = ["Random", "Test"] 593 | git-tree-sha1 = "a25d8e5a28c3b1b06d3859f30757d43106791919" 594 | uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" 595 | version = "0.9.4" 596 | 597 | [[URIParser]] 598 | deps = ["Test", "Unicode"] 599 | git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" 600 | uuid = "30578b45-9adc-5946-b283-645ec420af67" 601 | version = "0.4.0" 602 | 603 | [[UUIDs]] 604 | deps = ["Random", "SHA"] 605 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 606 | 607 | [[Unicode]] 608 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 609 | 610 | [[VersionParsing]] 611 | deps = ["Compat"] 612 | git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669" 613 | uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" 614 | version = "1.1.3" 615 | 616 | [[WoodburyMatrices]] 617 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Test"] 618 | git-tree-sha1 = "21772c33b447757ec7d3e61fcdfb9ea5c47eedcf" 619 | uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6" 620 | version = "0.4.1" 621 | 622 | [[ZipFile]] 623 | deps = ["BinaryProvider", "Libdl", "Printf"] 624 | git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" 625 | uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" 626 | version = "0.8.3" 627 | --------------------------------------------------------------------------------