├── REQUIRE ├── codecov.yml ├── .gitignore ├── .travis.yml ├── src ├── DistributedArrays.jl ├── core.jl ├── serialize.jl ├── sort.jl ├── spmd.jl ├── linalg.jl ├── mapreduce.jl └── darray.jl ├── LICENSE.md ├── test ├── runtests.jl ├── spmd.jl └── darray.jl └── README.md /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 0.6- 2 | Primes 3 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: off 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.cov 2 | *.jl.mem 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: julia 2 | os: 3 | - linux 4 | - osx 5 | julia: 6 | - nightly 7 | matrix: 8 | # allow_failures: 9 | # - julia: nightly 10 | notifications: 11 | email: false 12 | before_install: 13 | - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi 14 | after_success: 15 | - julia -e 'cd(Pkg.dir("DistributedArrays")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder()); Codecov.submit(Codecov.process_folder())' 16 | -------------------------------------------------------------------------------- /src/DistributedArrays.jl: -------------------------------------------------------------------------------- 1 | __precompile__(true) 2 | 3 | module DistributedArrays 4 | 5 | using Primes 6 | using Primes: factor 7 | 8 | importall Base 9 | import Base.Callable 10 | import Base.BLAS: axpy! 11 | 12 | # DArray exports 13 | export (.+), (.-), (.*), (./), (.%), (.<<), (.>>), div, mod, rem, (&), (|), ($) 14 | export DArray, SubDArray, SubOrDArray, @DArray 15 | export dzeros, dones, dfill, drand, drandn, distribute, localpart, localindexes, ppeval, samedist 16 | 17 | # non-array distributed data 18 | export ddata, gather 19 | 20 | # immediate release of localparts 21 | export close, d_closeall 22 | 23 | include("darray.jl") 24 | include("core.jl") 25 | include("serialize.jl") 26 | include("mapreduce.jl") 27 | include("linalg.jl") 28 | include("sort.jl") 29 | 30 | include("spmd.jl") 31 | export SPMD 32 | 33 | end # module 34 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The DistributedArrays.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2015: Julia Parallel Contributors 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 
15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Base.Test 2 | 3 | using DistributedArrays 4 | 5 | # add at least 3 worker processes 6 | if nworkers() < 3 7 | n = max(3, min(8, Sys.CPU_CORES)) 8 | addprocs(n; exeflags=`--check-bounds=yes`) 9 | end 10 | @assert nprocs() > 3 11 | @assert nworkers() >= 3 12 | 13 | @everywhere importall DistributedArrays 14 | @everywhere importall DistributedArrays.SPMD 15 | 16 | @everywhere srand(1234 + myid()) 17 | 18 | const MYID = myid() 19 | const OTHERIDS = filter(id-> id != MYID, procs())[rand(1:(nprocs()-1))] 20 | 21 | # On 0.6, @testset does not display the test description automatically anymore. 22 | function print_test_desc(t, n=0) 23 | println(repeat(" ", n), "Passed : ", t.description) 24 | for t2 in t.results 25 | if isa(t2, Base.Test.DefaultTestSet) 26 | print_test_desc(t2, n+2) 27 | end 28 | end 29 | end 30 | 31 | function check_leaks(t=nothing) 32 | if length(DistributedArrays.refs) > 0 33 | sleep(0.1) # allow time for any cleanup to complete and test again 34 | length(DistributedArrays.refs) > 0 && warn("Probable leak of ", length(DistributedArrays.refs), " darrays") 35 | end 36 | 37 | isa(t, Base.Test.DefaultTestSet) && print_test_desc(t) 38 | end 39 | 40 | include("darray.jl") 41 | include("spmd.jl") 42 | 43 | -------------------------------------------------------------------------------- /src/core.jl: -------------------------------------------------------------------------------- 1 | const registry=Dict{Tuple, Any}() 2 | const refs=Set() # Collection of darray identities created on this node 3 | 4 | let DID::Int = 1 5 | global next_did 6 | next_did() = (id = DID; DID += 1; (myid(), id)) 7 | end 8 | 9 | """ 10 | next_did() 11 | 12 | Produces an incrementing ID that will be used for DArrays. 13 | """ 14 | next_did 15 | 16 | release_localpart(id::Tuple) = (delete!(registry, id); nothing) 17 | release_localpart(d) = release_localpart(d.id) 18 | 19 | function close_by_id(id, pids) 20 | # @schedule println("Finalizer for : ", id) 21 | global refs 22 | @sync begin 23 | for p in pids 24 | @async remotecall_fetch(release_localpart, p, id) 25 | end 26 | if !(myid() in pids) 27 | release_localpart(id) 28 | end 29 | end 30 | delete!(refs, id) 31 | nothing 32 | end 33 | 34 | function close(d::DArray) 35 | # @schedule println("close : ", d.id, ", object_id : ", object_id(d), ", myid : ", myid() ) 36 | if (myid() == d.id[1]) && d.release 37 | @schedule close_by_id(d.id, d.pids) 38 | d.release = false 39 | end 40 | nothing 41 | end 42 | 43 | function d_closeall() 44 | crefs = copy(refs) 45 | for id in crefs 46 | if id[1] == myid() # sanity check 47 | haskey(registry, id) && close(registry[id]) 48 | yield() 49 | end 50 | end 51 | end 52 | 53 | """ 54 | procs(d::DArray) 55 | 56 | Get the vector of processes storing pieces of DArray `d`. 
57 | """ 58 | Base.procs(d::DArray) = d.pids 59 | 60 | """ 61 | localpart(A) 62 | 63 | The identity when input is not distributed 64 | """ 65 | localpart(A) = A 66 | 67 | -------------------------------------------------------------------------------- /src/serialize.jl: -------------------------------------------------------------------------------- 1 | function Base.serialize{T,N,A}(S::AbstractSerializer, d::DArray{T,N,A}) 2 | # Only send the ident for participating workers - we expect the DArray to exist in the 3 | # remote registry. DO NOT send the localpart. 4 | destpid = Base.worker_id_from_socket(S.io) 5 | Serializer.serialize_type(S, typeof(d)) 6 | if (destpid in d.pids) || (destpid == d.id[1]) 7 | serialize(S, (true, d.id)) # (id_only, id) 8 | else 9 | serialize(S, (false, d.id)) 10 | for n in [:dims, :pids, :indexes, :cuts] 11 | serialize(S, getfield(d, n)) 12 | end 13 | serialize(S, A) 14 | end 15 | end 16 | 17 | function Base.deserialize{DT<:DArray}(S::AbstractSerializer, t::Type{DT}) 18 | what = deserialize(S) 19 | id_only = what[1] 20 | id = what[2] 21 | 22 | if id_only 23 | if haskey(registry, id) 24 | return registry[id] 25 | else 26 | # access to fields will throw an error, at least the deserialization process will not 27 | # result in worker death 28 | d = DT() 29 | d.id = id 30 | return d 31 | end 32 | else 33 | # We are not a participating worker, deser fields and instantiate locally. 34 | dims = deserialize(S) 35 | pids = deserialize(S) 36 | indexes = deserialize(S) 37 | cuts = deserialize(S) 38 | A = deserialize(S) 39 | T=eltype(DT) 40 | N=length(dims) 41 | return DT(id, dims, pids, indexes, cuts, empty_localpart(T,N,A)) 42 | end 43 | end 44 | 45 | # Serialize only those parts of the object as required by the destination worker. 46 | type DestinationSerializer 47 | generate::Nullable{Function} # Function to generate the part to be serialized 48 | pids::Nullable{Array} # MUST have the same shape as the distribution 49 | 50 | deser_obj::Nullable{Any} # Deserialized part 51 | 52 | DestinationSerializer(f,p,d) = new(f,p,d) 53 | end 54 | 55 | DestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, Nullable{Any}()) 56 | 57 | # contructs a DestinationSerializer after verifying that the shape of pids. 58 | function verified_destination_serializer(f::Function, pids::Array, verify_size) 59 | @assert size(pids) == verify_size 60 | return DestinationSerializer(f, pids) 61 | end 62 | 63 | DestinationSerializer(deser_obj::Any) = DestinationSerializer(Nullable{Function}(), Nullable{Array}(), deser_obj) 64 | 65 | function Base.serialize(S::AbstractSerializer, s::DestinationSerializer) 66 | pid = Base.worker_id_from_socket(S.io) 67 | pididx = findfirst(get(s.pids), pid) 68 | Serializer.serialize_type(S, typeof(s)) 69 | serialize(S, get(s.generate)(pididx)) 70 | end 71 | 72 | function Base.deserialize{T<:DestinationSerializer}(S::AbstractSerializer, t::Type{T}) 73 | lpart = deserialize(S) 74 | return DestinationSerializer(lpart) 75 | end 76 | 77 | 78 | function localpart(s::DestinationSerializer) 79 | if !isnull(s.deser_obj) 80 | return get(s.deser_obj) 81 | elseif !isnull(s.generate) && (myid() in get(s.pids)) 82 | # Handle the special case where myid() is part of s.pids. 
83 | # In this case serialize/deserialize is not called as the remotecall is executed locally 84 | return get(s.generate)(findfirst(get(s.pids), myid())) 85 | else 86 | throw(ErrorException(string("Invalid state in DestinationSerializer."))) 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /test/spmd.jl: -------------------------------------------------------------------------------- 1 | @everywhere function spmd_test1() 2 | barrier(;tag=:b1) 3 | 4 | if myid() == 1 5 | @assert recvfrom(2) == "Hello from 2" 6 | println("SPMD: Passed send/recv") 7 | elseif myid() == 2 8 | data = "Hello from 2" 9 | sendto(1, data) 10 | end 11 | 12 | stime = rand(1:5) 13 | # println("Sleeping for $stime seconds") 14 | sleep(stime) 15 | barrier(;tag=:b2) 16 | 17 | bcast_val = nothing 18 | if myid() == 1 19 | bcast_val = rand(2) 20 | end 21 | 22 | bcast_val = bcast(bcast_val, 1) 23 | 24 | if myid() == 1 25 | @assert bcast_val == recvfrom(2) 26 | println("SPMD: Passed broadcast") 27 | elseif myid() == 2 28 | sendto(1, bcast_val) 29 | end 30 | 31 | barrier() 32 | 33 | scatter_data = nothing 34 | if myid() == 1 35 | scatter_data = rand(Int8, nprocs()) 36 | end 37 | lp = scatter(scatter_data, 1, tag=1) 38 | 39 | if myid() == 1 40 | @assert scatter_data[2:2] == recvfrom(2) 41 | println("SPMD: Passed scatter 1") 42 | elseif myid() == 2 43 | sendto(1, lp) 44 | end 45 | 46 | scatter_data = nothing 47 | if myid() == 1 48 | scatter_data = rand(Int8, nprocs()*2) 49 | end 50 | lp = scatter(scatter_data, 1, tag=2) 51 | 52 | if myid() == 1 53 | @assert scatter_data[3:4] == recvfrom(2) 54 | println("SPMD: Passed scatter 2") 55 | elseif myid() == 2 56 | sendto(1, lp) 57 | end 58 | 59 | gathered_data = gather(myid(), 1, tag=3) 60 | if myid() == 1 61 | @assert gathered_data == procs() 62 | println("SPMD: Passed gather 1") 63 | end 64 | 65 | gathered_data = gather([myid(), myid()], 1, tag=4) 66 | if myid() == 1 67 | @assert gathered_data == [[p,p] for p in procs()] 68 | println("SPMD: Passed gather 2") 69 | end 70 | end 71 | 72 | spmd(spmd_test1) 73 | 74 | # Test running only on the workers using the spmd function. 75 | 76 | # define the function everywhere 77 | @everywhere function foo_spmd(d_in, d_out, n) 78 | pids=sort(vec(procs(d_in))) 79 | pididx = findfirst(pids, myid()) 80 | mylp = localpart(d_in) 81 | localsum = 0 82 | 83 | # Have each node exchange data with its neighbors 84 | n_pididx = pididx+1 > length(pids) ? 1 : pididx+1 85 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 86 | 87 | # println(p_pididx, " p", pids[p_pididx], " ", n_pididx, " p", pids[n_pididx]) 88 | # println(mylp) 89 | 90 | for i in 1:n 91 | sendto(pids[n_pididx], mylp[2]) 92 | sendto(pids[p_pididx], mylp[1]) 93 | 94 | mylp[2] = recvfrom(pids[p_pididx]) 95 | mylp[1] = recvfrom(pids[n_pididx]) 96 | 97 | # println(mylp) 98 | 99 | barrier(;pids=pids) 100 | localsum = localsum + mylp[1] + mylp[2] 101 | end 102 | 103 | # finally store the sum in d_out 104 | d_out[:L] = localsum 105 | end 106 | 107 | # run foo_spmd on all workers, many of them, all concurrently using implictly different contexts. 
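# (Each input DArray below is an nworkers()-by-2 matrix, distributed one row per worker and filled
# with the owning worker's id; after nworkers() ring-exchange iterations every worker's local sum
# is 2*sum(workers()), which is what the gather-based checks below verify.)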
108 | in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8) 109 | out_arrays = map(x->ddata(), 1:8) 110 | 111 | @sync for i in 1:8 112 | @async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers()) 113 | end 114 | for i in 1:8 115 | @test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i]) 116 | end 117 | 118 | println("SPMD: Passed testing of spmd function run concurrently") 119 | 120 | # run concurrently with explictly different contexts 121 | 122 | # define the function everywhere 123 | @everywhere function foo_spmd2(d_in, d_out, n) 124 | pids=sort(vec(procs(d_in))) 125 | pididx = findfirst(pids, myid()) 126 | mylp = localpart(d_in) 127 | 128 | # see if we have a value in the local store. 129 | store = context_local_storage() 130 | 131 | localsum = get!(store, :LOCALSUM, 0) 132 | 133 | # Have each node exchange data with its neighbors 134 | n_pididx = pididx+1 > length(pids) ? 1 : pididx+1 135 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 136 | 137 | for i in 1:n 138 | sendto(pids[n_pididx], mylp[2]) 139 | sendto(pids[p_pididx], mylp[1]) 140 | 141 | mylp[2] = recvfrom(pids[p_pididx]) 142 | mylp[1] = recvfrom(pids[n_pididx]) 143 | 144 | barrier(;pids=pids) 145 | localsum = localsum + mylp[1] + mylp[2] 146 | end 147 | 148 | # finally store the sum in d_out 149 | d_out[:L] = localsum 150 | store[:LOCALSUM] = localsum 151 | end 152 | 153 | 154 | in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8) 155 | out_arrays = map(x->ddata(), 1:8) 156 | contexts = map(x->context(workers()), 1:8) 157 | 158 | @sync for i in 1:8 159 | @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i]) 160 | end 161 | # Second run will add the value stored in the previous run. 162 | @sync for i in 1:8 163 | @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i]) 164 | end 165 | 166 | for i in 1:8 167 | @test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i]) 168 | end 169 | 170 | # verify localstores with appropriate context store values exist. 171 | @everywhere begin 172 | if myid() != 1 173 | n = 0 174 | for (k,v) in DistributedArrays.SPMD.map_ctxts 175 | store = v.store 176 | localsum = store[:LOCALSUM] 177 | if localsum != 2*sum(workers())*2 178 | println("localsum ", localsum, " != $(2*sum(workers())*2)") 179 | error("localsum mismatch") 180 | end 181 | n += 1 182 | end 183 | @assert n == 8 184 | end 185 | end 186 | 187 | # close the contexts 188 | foreach(x->close(x), contexts) 189 | 190 | # verify that the localstores have been deleted. 191 | @everywhere begin 192 | @assert isempty(DistributedArrays.SPMD.map_ctxts) 193 | end 194 | 195 | println("SPMD: Passed spmd function with explicit context run concurrently") 196 | 197 | -------------------------------------------------------------------------------- /src/sort.jl: -------------------------------------------------------------------------------- 1 | # Sorting a DVector using samplesort 2 | 3 | function sample_n_setup_ref(d::DVector, sample_size; kwargs...) 4 | lp = localpart(d) 5 | llp = length(lp) 6 | np = length(procs(d)) 7 | sample_size = llp > sample_size ? sample_size : llp 8 | sorted = sort(lp; kwargs...) 9 | sample = sorted[collect(1:div(llp,sample_size):llp)] 10 | ref = RemoteChannel(()->Channel(np+1)) # To collect parts to be sorted locally later. 
11 | # First element is the locally sorted vector 12 | put!(ref, sorted) 13 | return (sample, ref) 14 | end 15 | 16 | 17 | function scatter_n_sort_localparts{T}(d, myidx, refs::Array{RemoteChannel}, boundaries::Array{T}; by = identity, kwargs...) 18 | if d==nothing 19 | sorted = take!(refs[myidx]) # First entry in the remote channel is sorted localpart 20 | else 21 | sorted = sort(localpart(d); by = by, kwargs...) 22 | end 23 | 24 | # send respective parts to correct workers, iterate over sorted array 25 | p_sorted = 1 26 | for (i,r) in enumerate(refs) 27 | p_till = length(sorted)+1 28 | 29 | # calculate range to send to refs[i] 30 | ctr=1 31 | for x in sorted[p_sorted:end] 32 | if by(x) > by(boundaries[i+1]) 33 | p_till = p_sorted+ctr-1 34 | break 35 | else 36 | ctr += 1 37 | end 38 | end 39 | 40 | if p_till == p_sorted 41 | @async put!(r, Array{T}(0)) 42 | else 43 | v = sorted[p_sorted:p_till-1] 44 | @async put!(r, v) 45 | end 46 | 47 | p_sorted = p_till 48 | end 49 | 50 | # wait to receive all of my parts from all other workers 51 | lp_sorting=T[] 52 | for _ in refs 53 | v = take!(refs[myidx]) 54 | append!(lp_sorting, v) 55 | end 56 | 57 | sorted_ref=RemoteChannel() 58 | put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...)) 59 | return (sorted_ref, length(lp_sorting)) 60 | end 61 | 62 | function compute_boundaries{T}(d::DVector{T}; kwargs...) 63 | pids = procs(d) 64 | np = length(pids) 65 | sample_sz_on_wrkr = 512 66 | 67 | results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids) 68 | 69 | samples = Array{T}(0) 70 | for x in results 71 | append!(samples, x[1]) 72 | end 73 | sort!(samples; kwargs...) 74 | samples[1] = typemin(T) 75 | 76 | refs=RemoteChannel[x[2] for x in results] 77 | 78 | boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]] 79 | push!(boundaries, typemax(T)) 80 | 81 | return (boundaries, refs) 82 | end 83 | 84 | """ 85 | sort(d::DVector; sample=true, kwargs...) -> DVector 86 | 87 | Sorts and returns a new distributed vector. 88 | 89 | The sorted vector may not have the same distribution as the original. 90 | 91 | Keyword argument `sample` can take values: 92 | 93 | - `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`. 94 | 95 | - `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d) 96 | 97 | - 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between specified min and max values 98 | 99 | - Array{T}: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution. 100 | 101 | Keyword argument `alg` takes the same options `Base.sort` 102 | """ 103 | function Base.sort{T}(d::DVector{T}; sample=true, kwargs...) 104 | pids = procs(d) 105 | np = length(pids) 106 | 107 | # Only `alg` and `sample` are supported as keyword arguments 108 | if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0 109 | throw(ArgumentError("Only `alg`, `by` and `sample` are supported as keyword arguments")) 110 | end 111 | 112 | if sample==true 113 | boundaries, refs = compute_boundaries(d; kwargs...) 
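        # compute_boundaries has already sorted each localpart (the sorted vectors sit in `refs`),
        # so scatter_n_sort_localparts is later called with `d == nothing` and skips re-sorting.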
114 | presorted=true 115 | 116 | elseif sample==false 117 | # Assume an uniform distribution between min and max values 118 | minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids) 119 | min_d = minimum(T[x[1] for x in minmax]) 120 | max_d = maximum(T[x[2] for x in minmax]) 121 | 122 | return sort(d; sample=(min_d,max_d), kwargs...) 123 | 124 | elseif isa(sample, Tuple) 125 | # Assume an uniform distribution between min and max values in the tuple 126 | lb=sample[1] 127 | ub=sample[2] 128 | 129 | @assert lb<=ub 130 | 131 | s = Array{T}(np) 132 | part = abs(ub - lb)/np 133 | (isnan(part) || isinf(part)) && throw(ArgumentError("lower and upper bounds must not be infinities")) 134 | 135 | for n in 1:np 136 | v = lb + (n-1)*part 137 | if T <: Integer 138 | s[n] = round(v) 139 | else 140 | s[n] = v 141 | end 142 | end 143 | return sort(d; sample=s, kwargs...) 144 | 145 | elseif isa(sample, Array) 146 | # Provided array is used as a sample 147 | samples = sort(copy(sample)) 148 | samples[1] = typemin(T) 149 | boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]] 150 | push!(boundaries, typemax(T)) 151 | presorted=false 152 | 153 | refs=RemoteChannel[RemoteChannel(p) for p in procs(d)] 154 | else 155 | throw(ArgumentError("keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : " * string(sample))) 156 | end 157 | 158 | local_sort_results = Array{Tuple}(np) 159 | 160 | Base.asyncmap!((i,p) -> remotecall_fetch( 161 | scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...), 162 | local_sort_results, 1:np, pids) 163 | 164 | # Construct a new DArray from the sorted refs. Remove parts with 0-length since 165 | # the DArray constructor_from_refs does not yet support it. This implies that 166 | # the participating workers for the sorted darray may be different from the original 167 | # for highly non-uniform distributions. 
168 | local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)] 169 | return DArray(local_sorted_refs) 170 | end 171 | -------------------------------------------------------------------------------- /src/spmd.jl: -------------------------------------------------------------------------------- 1 | module SPMD 2 | 3 | import DistributedArrays: gather, next_did, close 4 | import Base.recvfrom # UDP socket 5 | export sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather 6 | export context_local_storage, context, spmd, close 7 | 8 | 9 | type WorkerDataChannel 10 | pid::Int 11 | rc::Nullable{RemoteChannel} 12 | lock::ReentrantLock 13 | 14 | WorkerDataChannel(pid) = new(pid, Nullable{RemoteChannel}(), ReentrantLock()) 15 | end 16 | 17 | type SPMDContext 18 | id::Tuple 19 | chnl::Channel 20 | store::Dict{Any,Any} 21 | pids::Array 22 | release::Bool 23 | 24 | function SPMDContext(id) 25 | ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), [], false) 26 | finalizer(ctxt, finalize_ctxt) 27 | ctxt 28 | end 29 | end 30 | 31 | function finalize_ctxt(ctxt::SPMDContext) 32 | ctxt.release && close(ctxt) 33 | end 34 | 35 | function context_local_storage() 36 | ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT)) 37 | ctxt.store 38 | end 39 | 40 | function context(pids=procs()) 41 | global map_ctxts 42 | ctxt = SPMDContext(next_did()) 43 | ctxt.pids = pids 44 | ctxt.release = true 45 | ctxt 46 | end 47 | 48 | # Every worker is associated with its own RemoteChannel 49 | const map_worker_channels = Dict{Int, WorkerDataChannel}() 50 | 51 | # mapping between a context id and context object 52 | const map_ctxts = Dict{Tuple, SPMDContext}() 53 | 54 | # Multiple SPMD blocks can be executed concurrently, 55 | # each in its own context. Messages are still sent as part of the 56 | # same remote channels associated with each worker. They are 57 | # read from the remote channel into local channels each associated 58 | # with a different run of `spmd`. 59 | 60 | function get_dc(wc::WorkerDataChannel) 61 | lock(wc.lock) 62 | try 63 | if isnull(wc.rc) 64 | if wc.pid == myid() 65 | myrc = RemoteChannel(()->Channel(typemax(Int))) 66 | wc.rc = Nullable{RemoteChannel}(myrc) 67 | 68 | # start a task to transfer incoming messages into local 69 | # channels based on the execution context 70 | @schedule begin 71 | while true 72 | msg = take!(myrc) 73 | ctxt_id = msg[1] # First element of the message tuple is the context id. 74 | ctxt = get_ctxt_from_id(ctxt_id) 75 | put!(ctxt.chnl, msg[2:end]) # stripping the context_id 76 | end 77 | end 78 | else 79 | wc.rc = Nullable{RemoteChannel}(remotecall_fetch(()->get_remote_dc(myid()), wc.pid)) 80 | end 81 | end 82 | finally 83 | unlock(wc.lock) 84 | end 85 | return get(wc.rc) 86 | end 87 | 88 | function get_ctxt_from_id(ctxt_id) 89 | global map_ctxts 90 | ctxt = get(map_ctxts, ctxt_id, nothing) 91 | if ctxt == nothing 92 | ctxt = SPMDContext(ctxt_id) 93 | map_ctxts[ctxt_id] = ctxt 94 | end 95 | return ctxt 96 | end 97 | 98 | 99 | # Since modules may be loaded in any order on the workers, 100 | # and workers may be dynamically added, pull in the remote channel 101 | # handles when accessed for the first time. 
102 | function get_remote_dc(pid) 103 | global map_worker_channels 104 | if !haskey(map_worker_channels, pid) 105 | map_worker_channels[pid] = WorkerDataChannel(pid) 106 | end 107 | 108 | return get_dc(map_worker_channels[pid]) 109 | end 110 | 111 | function send_msg(to, typ, data, tag) 112 | ctxt_id = task_local_storage(:SPMD_CTXT) 113 | @async begin 114 | dc = get_remote_dc(to) 115 | put!(dc, (ctxt_id, typ, myid(), data, tag)) 116 | # println("Sent to ", dc) 117 | end 118 | end 119 | 120 | function get_msg(typ_check, from_check=false, tag_check=nothing) 121 | ctxt_id = task_local_storage(:SPMD_CTXT) 122 | chnl = get_ctxt_from_id(ctxt_id).chnl 123 | 124 | unexpected_msgs=[] 125 | while true 126 | typ, from, data, tag = take!(chnl) 127 | 128 | if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check) 129 | push!(unexpected_msgs, (typ, from, data, tag)) 130 | # println("Unexpected in get_msg ", unexpected_msgs, " looking for ", typ_check, " ", from_check, " ", tag_check) 131 | else 132 | # put all the messages we read (but not expected) back to the local channel 133 | foreach(x->put!(chnl, x), unexpected_msgs) 134 | return (from, data) 135 | end 136 | end 137 | end 138 | 139 | function sendto(pid::Int, data::Any; tag=nothing) 140 | send_msg(pid, :sendto, data, tag) 141 | end 142 | 143 | function recvfrom(pid::Int; tag=nothing) 144 | _, data = get_msg(:sendto, pid, tag) 145 | return data 146 | end 147 | 148 | function recvfrom_any(; tag=nothing) 149 | from, data = get_msg(:sendto, false, tag) 150 | return (from,data) 151 | end 152 | 153 | function barrier(;pids=procs(), tag=nothing) 154 | # send a message to everyone 155 | for p in sort(pids) 156 | send_msg(p, :barrier, nothing, tag) 157 | end 158 | # make sure we recv a message from everyone 159 | pending=deepcopy(pids) 160 | unexpected_msgs=[] 161 | 162 | while length(pending) > 0 163 | from, _ = get_msg(:barrier, false, tag) 164 | if from in pending 165 | filter!(x->x!=from, pending) 166 | else 167 | # handle case of 2 (or more) consecutive barrier calls. 
168 | push!(unexpected_msgs, (:barrier, from, nothing, tag)) 169 | # println("Unexpected ", from) 170 | end 171 | # length(pending) == 1 && println("Waiting for ", pending) 172 | end 173 | 174 | ctxt_id = task_local_storage(:SPMD_CTXT) 175 | chnl = get_ctxt_from_id(ctxt_id).chnl 176 | foreach(x->put!(chnl, x), unexpected_msgs) 177 | return nothing 178 | end 179 | 180 | function bcast(data::Any, pid::Int; tag=nothing, pids=procs()) 181 | if myid() == pid 182 | for p in filter(x->x!=pid, sort(pids)) 183 | send_msg(p, :bcast, data, tag) 184 | end 185 | return data 186 | else 187 | from, data = get_msg(:bcast, pid, tag) 188 | return data 189 | end 190 | end 191 | 192 | function scatter(x, pid::Int; tag=nothing, pids=procs()) 193 | if myid() == pid 194 | @assert rem(length(x), length(pids)) == 0 195 | cnt = div(length(x), length(pids)) 196 | for (i,p) in enumerate(sort(pids)) 197 | p == pid && continue 198 | send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag) 199 | end 200 | myidx = findfirst(sort(pids), pid) 201 | return x[cnt*(myidx-1)+1:cnt*myidx] 202 | else 203 | _, data = get_msg(:scatter, pid, tag) 204 | return data 205 | end 206 | end 207 | 208 | function gather(x, pid::Int; tag=nothing, pids=procs()) 209 | if myid() == pid 210 | gathered_data = Array{Any}(length(pids)) 211 | myidx = findfirst(sort(pids), pid) 212 | gathered_data[myidx] = x 213 | n = length(pids) - 1 214 | while n > 0 215 | from, data_x = get_msg(:gather, false, tag) 216 | fromidx = findfirst(sort(pids), from) 217 | gathered_data[fromidx] = data_x 218 | n=n-1 219 | end 220 | return gathered_data 221 | else 222 | send_msg(pid, :gather, x, tag) 223 | return x 224 | end 225 | end 226 | 227 | function spmd_local(f, ctxt_id, clear_ctxt) 228 | task_local_storage(:SPMD_CTXT, ctxt_id) 229 | f() 230 | clear_ctxt && delete_ctxt_id(ctxt_id) 231 | return nothing 232 | end 233 | 234 | function spmd(f, args...; pids=procs(), context=nothing) 235 | f_noarg = ()->f(args...) 236 | clear_ctxt = false 237 | if context == nothing 238 | ctxt_id = next_did() 239 | clear_ctxt = true # temporary unique context created for this run. 240 | # should be cleared at the end of the run. 
241 | else 242 | ctxt_id = context.id 243 | end 244 | @sync for p in pids 245 | @async remotecall_fetch(spmd_local, p, f_noarg, ctxt_id, clear_ctxt) 246 | end 247 | nothing 248 | end 249 | 250 | function delete_ctxt_id(ctxt_id) 251 | global map_ctxts 252 | haskey(map_ctxts, ctxt_id) && delete!(map_ctxts, ctxt_id) 253 | nothing 254 | end 255 | 256 | function close(ctxt::SPMDContext) 257 | for p in ctxt.pids 258 | Base.remote_do(delete_ctxt_id, p, ctxt.id) 259 | end 260 | ctxt.release = false 261 | end 262 | 263 | end -------------------------------------------------------------------------------- /src/linalg.jl: -------------------------------------------------------------------------------- 1 | function Base.ctranspose{T}(D::DArray{T,2}) 2 | DArray(reverse(size(D)), procs(D)) do I 3 | lp = Array{T}(map(length, I)) 4 | rp = convert(Array, D[reverse(I)...]) 5 | ctranspose!(lp, rp) 6 | end 7 | end 8 | 9 | function Base.transpose{T}(D::DArray{T,2}) 10 | DArray(reverse(size(D)), procs(D)) do I 11 | lp = Array{T}(map(length, I)) 12 | rp = convert(Array, D[reverse(I)...]) 13 | transpose!(lp, rp) 14 | end 15 | end 16 | 17 | typealias DVector{T,A} DArray{T,1,A} 18 | typealias DMatrix{T,A} DArray{T,2,A} 19 | 20 | # Level 1 21 | 22 | function axpy!(α, x::DVector, y::DVector) 23 | if length(x) != length(y) 24 | throw(DimensionMismatch("vectors must have same length")) 25 | end 26 | @sync for p in procs(y) 27 | @async remotecall_fetch(() -> (Base.axpy!(α, localpart(x), localpart(y)); nothing), p) 28 | end 29 | return y 30 | end 31 | 32 | function dot(x::DVector, y::DVector) 33 | if length(x) != length(y) 34 | throw(DimensionMismatch("")) 35 | end 36 | if (procs(x) != procs(y)) || (x.cuts != y.cuts) 37 | throw(ArgumentError("vectors don't have the same distribution. Not handled for efficiency reasons.")) 38 | end 39 | 40 | results=Any[] 41 | @sync begin 42 | for i = eachindex(x.pids) 43 | @async push!(results, remotecall_fetch((x, y, i) -> dot(localpart(x), fetch(y, i)), x.pids[i], x, y, i)) 44 | end 45 | end 46 | return reduce(+, results) 47 | end 48 | 49 | function norm(x::DVector, p::Real = 2) 50 | results = [] 51 | @sync begin 52 | for pp in procs(x) 53 | @async push!(results, remotecall_fetch(() -> norm(localpart(x), p), pp)) 54 | end 55 | end 56 | return norm(results, p) 57 | end 58 | 59 | Base.scale!(A::DArray, x::Number) = begin 60 | @sync for p in procs(A) 61 | @async remotecall_fetch((A,x)->(scale!(localpart(A), x); nothing), p, A, x) 62 | end 63 | return A 64 | end 65 | 66 | # Level 2 67 | function add!(dest, src, scale = one(dest[1])) 68 | if length(dest) != length(src) 69 | throw(DimensionMismatch("source and destination arrays must have same number of elements")) 70 | end 71 | if scale == one(scale) 72 | @simd for i = eachindex(dest) 73 | @inbounds dest[i] += src[i] 74 | end 75 | else 76 | @simd for i = eachindex(dest) 77 | @inbounds dest[i] += scale*src[i] 78 | end 79 | end 80 | return dest 81 | end 82 | 83 | function A_mul_B!(α::Number, A::DMatrix, x::AbstractVector, β::Number, y::DVector) 84 | 85 | # error checks 86 | if size(A, 2) != length(x) 87 | throw(DimensionMismatch("")) 88 | end 89 | if y.cuts[1] != A.cuts[1] 90 | throw(ArgumentError("cuts of output vector must match cuts of first dimension of matrix")) 91 | end 92 | 93 | # Multiply on each tile of A 94 | R = Array{Future}(size(A.pids)...) 
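    # R[i,j] holds a Future for the product of A's localpart on process-grid tile (i,j) with the
    # slice of x selected by A's column cuts; the fetched results are accumulated into y below.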
95 | for j = 1:size(A.pids, 2) 96 | xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1] 97 | for i = 1:size(A.pids, 1) 98 | R[i,j] = remotecall(procs(A)[i,j]) do 99 | localpart(A)*convert(localtype(x), xj) 100 | end 101 | end 102 | end 103 | 104 | # Scale y if necessary 105 | if β != one(β) 106 | @sync for p in y.pids 107 | if β != zero(β) 108 | @async remotecall_fetch(y -> (scale!(localpart(y), β); nothing), p, y) 109 | else 110 | @async remotecall_fetch(y -> (fill!(localpart(y), 0); nothing), p, y) 111 | end 112 | end 113 | end 114 | 115 | # Update y 116 | @sync for i = 1:size(R, 1) 117 | p = y.pids[i] 118 | for j = 1:size(R, 2) 119 | rij = R[i,j] 120 | @async remotecall_fetch(() -> (add!(localpart(y), fetch(rij), α); nothing), p) 121 | end 122 | end 123 | 124 | return y 125 | end 126 | 127 | function Ac_mul_B!(α::Number, A::DMatrix, x::AbstractVector, β::Number, y::DVector) 128 | 129 | # error checks 130 | if size(A, 1) != length(x) 131 | throw(DimensionMismatch("")) 132 | end 133 | if y.cuts[1] != A.cuts[2] 134 | throw(ArgumentError("cuts of output vector must match cuts of second dimension of matrix")) 135 | end 136 | 137 | # Multiply on each tile of A 138 | R = Array{Future}(reverse(size(A.pids))...) 139 | for j = 1:size(A.pids, 1) 140 | xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1] 141 | for i = 1:size(A.pids, 2) 142 | R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i]) 143 | end 144 | end 145 | 146 | # Scale y if necessary 147 | if β != one(β) 148 | @sync for p in y.pids 149 | if β != zero(β) 150 | @async remotecall_fetch(() -> (scale!(localpart(y), β); nothing), p) 151 | else 152 | @async remotecall_fetch(() -> (fill!(localpart(y), 0); nothing), p) 153 | end 154 | end 155 | end 156 | 157 | # Update y 158 | @sync for i = 1:size(R, 1) 159 | p = y.pids[i] 160 | for j = 1:size(R, 2) 161 | rij = R[i,j] 162 | @async remotecall_fetch(() -> (add!(localpart(y), fetch(rij), α); nothing), p) 163 | end 164 | end 165 | return y 166 | end 167 | 168 | function Base.LinAlg.scale!(b::AbstractVector, DA::DMatrix) 169 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 170 | b[DA.indexes[pididx][1]] 171 | end 172 | map_localparts!(DA) do lDA 173 | scale!(localpart(s), lDA) 174 | end 175 | end 176 | 177 | function Base.LinAlg.scale!(DA::DMatrix, b::AbstractVector) 178 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 179 | b[DA.indexes[pididx][2]] 180 | end 181 | map_localparts!(DA) do lDA 182 | scale!(lDA, localpart(s)) 183 | end 184 | end 185 | 186 | # Level 3 187 | function _matmatmul!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix, tA) 188 | # error checks 189 | Ad1, Ad2 = (tA == 'N') ? 
(1,2) : (2,1) 190 | mA, nA = size(A, Ad1, Ad2) 191 | mB, nB = size(B) 192 | if mB != nA 193 | throw(DimensionMismatch("matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)")) 194 | end 195 | if size(C,1) != mA || size(C,2) != nB 196 | throw(DimensionMismatch("result C has dimensions $(size(C)), needs ($mA, $nB)")) 197 | end 198 | if C.cuts[1] != A.cuts[Ad1] 199 | throw(ArgumentError("cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix")) 200 | end 201 | 202 | # Multiply on each tile of A 203 | if tA == 'N' 204 | R = Array{Future}(size(procs(A))..., size(procs(C), 2)) 205 | else 206 | R = Array{Future}(reverse(size(procs(A)))..., size(procs(C), 2)) 207 | end 208 | for j = 1:size(A.pids, Ad2) 209 | for k = 1:size(C.pids, 2) 210 | Acuts = A.cuts[Ad2] 211 | Ccuts = C.cuts[2] 212 | Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1] 213 | for i = 1:size(A.pids, Ad1) 214 | p = (tA == 'N') ? procs(A)[i,j] : procs(A)[j,i] 215 | R[i,j,k] = remotecall(p) do 216 | if tA == 'T' 217 | return localpart(A).'*convert(localtype(B), Bjk) 218 | elseif tA == 'C' 219 | return localpart(A)'*convert(localtype(B), Bjk) 220 | else 221 | return localpart(A)*convert(localtype(B), Bjk) 222 | end 223 | end 224 | end 225 | end 226 | end 227 | 228 | # Scale C if necessary 229 | if β != one(β) 230 | @sync for p in C.pids 231 | if β != zero(β) 232 | @async remotecall_fetch(() -> (scale!(localpart(C), β); nothing), p) 233 | else 234 | @async remotecall_fetch(() -> (fill!(localpart(C), 0); nothing), p) 235 | end 236 | end 237 | end 238 | 239 | # Update C 240 | @sync for i = 1:size(R, 1) 241 | for k = 1:size(C.pids, 2) 242 | p = C.pids[i,k] 243 | for j = 1:size(R, 2) 244 | rijk = R[i,j,k] 245 | @async remotecall_fetch(d -> (add!(localpart(d), fetch(rijk), α); nothing), p, C) 246 | end 247 | end 248 | end 249 | return C 250 | end 251 | 252 | A_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'N') 253 | Ac_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'C') 254 | At_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'T') 255 | At_mul_B!(C::DMatrix, A::DMatrix, B::AbstractMatrix) = At_mul_B!(one(eltype(C)), A, B, zero(eltype(C)), C) 256 | 257 | _matmul_op = (t,s) -> t*s + t*s 258 | 259 | function (*)(A::DMatrix, x::AbstractVector) 260 | T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) 261 | y = DArray(I -> Array{T}(map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),)) 262 | return A_mul_B!(one(T), A, x, zero(T), y) 263 | end 264 | function (*)(A::DMatrix, B::AbstractMatrix) 265 | T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) 266 | C = DArray(I -> Array{T}(map(length, I)), 267 | (size(A, 1), size(B, 2)), 268 | procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))], 269 | (size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2)))) 270 | return A_mul_B!(one(T), A, B, zero(T), C) 271 | end 272 | 273 | function Ac_mul_B(A::DMatrix, x::AbstractVector) 274 | T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) 275 | y = DArray(I -> Array{T}(map(length, I)), 276 | (size(A, 2),), 277 | procs(A)[1,:], 278 | (size(procs(A), 2),)) 279 | return Ac_mul_B!(one(T), A, x, zero(T), y) 280 | end 281 | function Ac_mul_B(A::DMatrix, B::AbstractMatrix) 282 | T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) 283 | C = DArray(I -> Array{T}(map(length, I)), (size(A, 2), 284 | 
size(B, 2)), 285 | procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:], 286 | (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2)))) 287 | return Ac_mul_B!(one(T), A, B, zero(T), C) 288 | end 289 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | ## higher-order functions ## 2 | 3 | Base.map(f, d::DArray) = DArray(I->map(f, localpart(d)), d) 4 | 5 | Base.map!{F}(f::F, dest::DArray, src::DArray) = begin 6 | @sync for p in procs(dest) 7 | @async remotecall_fetch(() -> (map!(f, localpart(dest), src[localindexes(dest)...]); nothing), p) 8 | end 9 | return dest 10 | end 11 | 12 | Base.Broadcast._containertype{D<:DArray}(::Type{D}) = DArray 13 | 14 | Base.Broadcast.promote_containertype(::Type{DArray}, ::Type{DArray}) = DArray 15 | Base.Broadcast.promote_containertype(::Type{DArray}, ::Type{Array}) = DArray 16 | Base.Broadcast.promote_containertype(::Type{DArray}, ct) = DArray 17 | Base.Broadcast.promote_containertype(::Type{Array}, ::Type{DArray}) = DArray 18 | Base.Broadcast.promote_containertype(ct, ::Type{DArray}) = DArray 19 | 20 | Base.Broadcast.broadcast_indices(::Type{DArray}, A) = indices(A) 21 | Base.Broadcast.broadcast_indices(::Type{DArray}, A::Ref) = () 22 | 23 | # FixMe! 24 | ## 1. Support for arbitrary indices including OneTo 25 | ## 2. This is as type unstable as it can be. Overhead might not matter too much for DArrays though. 26 | function Base.Broadcast.broadcast_c(f, ::Type{DArray}, As...) 27 | T = Base.Broadcast._broadcast_eltype(f, As...) 28 | shape = Base.Broadcast.broadcast_indices(As...) 29 | iter = Base.CartesianRange(shape) 30 | D = DArray(map(length, shape)) do I 31 | Base.Broadcast.broadcast_c(f, Array, 32 | map(a -> isa(a, Union{Number,Ref}) ? a : 33 | localtype(a)(a[ntuple(i -> i > ndims(a) ? 1 : (size(a, i) == 1 ? (1:1) : I[i]), length(shape))...]), As)...) 34 | end 35 | return D 36 | end 37 | 38 | function Base.reduce(f, d::DArray) 39 | results=[] 40 | @sync begin 41 | for p in procs(d) 42 | @async push!(results, remotecall_fetch((f,d)->reduce(f, localpart(d)), p, f, d)) 43 | end 44 | end 45 | reduce(f, results) 46 | end 47 | 48 | function _mapreduce(f, opt, d::DArray) 49 | # TODO Change to an @async remotecall_fetch - will reduce one extra network hop - 50 | # once bug in master is fixed. 51 | results=[] 52 | @sync begin 53 | for p in procs(d) 54 | @async push!(results, remotecall_fetch((f,opt,d)->mapreduce(f, opt, localpart(d)), p, f, opt, d)) 55 | end 56 | end 57 | reduce(opt, results) 58 | end 59 | Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d) 60 | Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d) 61 | Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d) 62 | 63 | # mapreducedim 64 | Base.reducedim_initarray{R}(A::DArray, region, v0, ::Type{R}) = begin 65 | # Store reduction on lowest pids 66 | pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...] 67 | chunks = similar(pids, Future) 68 | @sync for i in eachindex(pids) 69 | @async chunks[i...] 
= remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), pids[i...]) 70 | end 71 | return DArray(chunks) 72 | end 73 | Base.reducedim_initarray{T}(A::DArray, region, v0::T) = Base.reducedim_initarray(A, region, v0, T) 74 | 75 | Base.reducedim_initarray0{R}(A::DArray, region, v0, ::Type{R}) = begin 76 | # Store reduction on lowest pids 77 | pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...] 78 | chunks = similar(pids, Future) 79 | @sync for i in eachindex(pids) 80 | @async chunks[i...] = remotecall_wait(() -> Base.reducedim_initarray0(localpart(A), region, v0, R), pids[i...]) 81 | end 82 | return DArray(chunks) 83 | end 84 | Base.reducedim_initarray0{T}(A::DArray, region, v0::T) = Base.reducedim_initarray0(A, region, v0, T) 85 | 86 | # Compute mapreducedim of each localpart and store the result in a new DArray 87 | mapreducedim_within(f, op, A::DArray, region) = begin 88 | arraysize = [size(A)...] 89 | gridsize = [size(A.indexes)...] 90 | arraysize[[region...]] = gridsize[[region...]] 91 | indx = similar(A.indexes) 92 | for i in CartesianRange(size(indx)) 93 | indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indexes[i][j], ndims(A)) 94 | end 95 | cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)] 96 | return DArray(next_did(), I -> mapreducedim(f, op, localpart(A), region), 97 | tuple(arraysize...), procs(A), indx, cuts) 98 | end 99 | 100 | # Compute mapreducedim accros the processes. This should be done after mapreducedim 101 | # has been run on each localpart with mapreducedim_within. Eventually, we might 102 | # want to write mapreducedim_between! as a binary reduction. 103 | function mapreducedim_between!(f, op, R::DArray, A::DArray, region) 104 | @sync for p in procs(R) 105 | @async remotecall_fetch(p, f, op, R, A, region) do f, op, R, A, region 106 | localind = [r for r = localindexes(A)] 107 | localind[[region...]] = [1:n for n = size(A)[[region...]]] 108 | B = convert(Array, A[localind...]) 109 | Base.mapreducedim!(f, op, localpart(R), B) 110 | nothing 111 | end 112 | end 113 | return R 114 | end 115 | 116 | Base.mapreducedim!(f, op, R::DArray, A::DArray) = begin 117 | lsize = Base.check_reducedims(R,A) 118 | if isempty(A) 119 | return copy(R) 120 | end 121 | region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...) 122 | if isempty(region) 123 | return copy!(R, A) 124 | end 125 | B = mapreducedim_within(f, op, A, region) 126 | return mapreducedim_between!(identity, op, R, B, region) 127 | end 128 | 129 | Base.mapreducedim(f, op, R::DArray, A::DArray) = begin 130 | Base.mapreducedim!(f, op, Base.reducedim_initarray(A, region, v0), A) 131 | end 132 | 133 | function nnz(A::DArray) 134 | B = Array{Any}(size(A.pids)) 135 | @sync begin 136 | for i in eachindex(A.pids) 137 | @async B[i...] 
= remotecall_fetch(x -> nnz(localpart(x)), A.pids[i...], A) 138 | end 139 | end 140 | return reduce(+, B) 141 | end 142 | 143 | # reduce like 144 | for (fn, fr) in ((:sum, :+), 145 | (:prod, :*), 146 | (:maximum, :max), 147 | (:minimum, :min), 148 | (:any, :|), 149 | (:all, :&)) 150 | @eval (Base.$fn)(d::DArray) = reduce($fr, d) 151 | end 152 | 153 | # mapreduce like 154 | for (fn, fr1, fr2) in ((:maxabs, :abs, :max), 155 | (:minabs, :abs, :min), 156 | (:sumabs, :abs, :+), 157 | (:sumabs2, :abs2, :+)) 158 | @eval (Base.$fn)(d::DArray) = mapreduce($fr1, $fr2, d) 159 | end 160 | 161 | # semi mapreduce 162 | for (fn, fr) in ((:any, :|), 163 | (:all, :&), 164 | (:count, :+)) 165 | @eval begin 166 | (Base.$fn)(f::typeof(identity), d::DArray) = mapreduce(f, $fr, d) 167 | (Base.$fn)(f::Callable, d::DArray) = mapreduce(f, $fr, d) 168 | end 169 | end 170 | 171 | # Unary vector functions 172 | (-)(D::DArray) = map(-, D) 173 | 174 | @static if VERSION < v"0.6.0-dev.1731" 175 | # scalar ops 176 | (+)(A::DArray{Bool}, x::Bool) = A .+ x 177 | (+)(x::Bool, A::DArray{Bool}) = x .+ A 178 | (-)(A::DArray{Bool}, x::Bool) = A .- x 179 | (-)(x::Bool, A::DArray{Bool}) = x .- A 180 | (+)(A::DArray, x::Number) = A .+ x 181 | (+)(x::Number, A::DArray) = x .+ A 182 | (-)(A::DArray, x::Number) = A .- x 183 | (-)(x::Number, A::DArray) = x .- A 184 | end 185 | 186 | map_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d) 187 | map_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I 188 | f(localpart(d1), localpart(d2)) 189 | end 190 | 191 | function map_localparts(f::Callable, DA::DArray, A::Array) 192 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 193 | A[DA.indexes[pididx]...] 194 | end 195 | DArray(DA) do I 196 | f(localpart(DA), localpart(s)) 197 | end 198 | end 199 | 200 | function map_localparts(f::Callable, A::Array, DA::DArray) 201 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 202 | A[DA.indexes[pididx]...] 203 | end 204 | DArray(DA) do I 205 | f(localpart(s), localpart(DA)) 206 | end 207 | end 208 | 209 | function map_localparts!(f::Callable, d::DArray) 210 | @sync for p in procs(d) 211 | @async remotecall_fetch((f,d)->(f(localpart(d)); nothing), p, f, d) 212 | end 213 | return d 214 | end 215 | 216 | # Here we assume all the DArrays have 217 | # the same size and distribution 218 | map_localparts(f::Callable, As::DArray...) 
= DArray(I->f(map(localpart, As)...), As[1]) 219 | 220 | @static if VERSION < v"0.6.0-dev.1632" 221 | for f in (:.+, :.-, :.*, :./, :.%, :.<<, :.>>, :div, :mod, :rem, :&, :|, :$) 222 | @eval begin 223 | ($f){T}(A::DArray{T}, B::Number) = map_localparts(r->($f)(r, B), A) 224 | ($f){T}(A::Number, B::DArray{T}) = map_localparts(r->($f)(A, r), B) 225 | end 226 | end 227 | end 228 | 229 | function samedist(A::DArray, B::DArray) 230 | (size(A) == size(B)) || throw(DimensionMismatch()) 231 | if (procs(A) != procs(B)) || (A.cuts != B.cuts) 232 | B = DArray(x->B[x...], A) 233 | end 234 | B 235 | end 236 | 237 | for f in (:+, :-, :div, :mod, :rem, :&, :|, :$) 238 | @eval begin 239 | function ($f){T}(A::DArray{T}, B::DArray{T}) 240 | B = samedist(A, B) 241 | map_localparts($f, A, B) 242 | end 243 | ($f){T}(A::DArray{T}, B::Array{T}) = map_localparts($f, A, B) 244 | ($f){T}(A::Array{T}, B::DArray{T}) = map_localparts($f, A, B) 245 | end 246 | end 247 | @static if VERSION < v"0.6.0-dev.1632" 248 | for f in (:.+, :.-, :.*, :./, :.%, :.<<, :.>>) 249 | @eval begin 250 | function ($f){T}(A::DArray{T}, B::DArray{T}) 251 | map_localparts($f, A, B) 252 | end 253 | ($f){T}(A::DArray{T}, B::Array{T}) = map_localparts($f, A, B) 254 | ($f){T}(A::Array{T}, B::DArray{T}) = map_localparts($f, A, B) 255 | end 256 | end 257 | end 258 | 259 | function mapslices{T,N,A}(f::Function, D::DArray{T,N,A}, dims::AbstractVector) 260 | if !all(t -> t == 1, size(D.indexes)[dims]) 261 | p = ones(Int, ndims(D)) 262 | nondims = filter(t -> !(t in dims), 1:ndims(D)) 263 | p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D)) 264 | DD = DArray(size(D), procs(D), p) do I 265 | return convert(A, D[I...]) 266 | end 267 | return mapslices(f, DD, dims) 268 | end 269 | 270 | refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),z), p, f, D, dims) for p in procs(D)] 271 | 272 | DArray(reshape(refs, size(procs(D)))) 273 | end 274 | 275 | function _ppeval(f, A...; dim = map(ndims, A)) 276 | if length(dim) != length(A) 277 | throw(ArgumentError("dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))")) 278 | end 279 | narg = length(A) 280 | dimlength = size(A[1], dim[1]) 281 | for i = 2:narg 282 | if dim[i] > 0 && dimlength != size(A[i], dim[i]) 283 | throw(ArgumentError("lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))")) 284 | end 285 | end 286 | dims = [] 287 | idx = [] 288 | args = [] 289 | for i = 1:narg 290 | push!(dims, ndims(A[i])) 291 | push!(idx, Any[1:size(A[i], d) for d in 1:dims[i]]) 292 | if dim[i] > 0 293 | idx[i][dim[i]] = 1 294 | push!(args, view(A[i], idx[i]...)) 295 | else 296 | push!(args, A[i]) 297 | end 298 | end 299 | R1 = f(args...) 300 | ridx = Any[1:size(R1, d) for d in 1:ndims(R1)] 301 | push!(ridx, 1) 302 | Rsize = map(last, ridx) 303 | Rsize[end] = dimlength 304 | R = Array{eltype(R1)}(Rsize...) 305 | 306 | for i = 1:dimlength 307 | for j = 1:narg 308 | if dim[j] > 0 309 | idx[j][dim[j]] = i 310 | args[j] = view(A[j], idx[j]...) 311 | else 312 | args[j] = A[j] 313 | end 314 | end 315 | ridx[end] = i 316 | R[ridx...] = f(args...) 317 | end 318 | 319 | return R 320 | end 321 | 322 | """ 323 | ppeval(f, D...; dim::NTuple) 324 | 325 | Evaluates the callable argument `f` on slices of the elements of the `D` tuple. 326 | 327 | #### Arguments 328 | `f` can be any callable object that accepts sliced or broadcasted elements of `D`. 
329 | The result returned from `f` must be either an array or a scalar. 330 | 331 | `D` has any number of elements and the alements can have any type. If an element 332 | of `D` is a distributed array along the dimension specified by `dim`. If an 333 | element of `D` is not distributed, the element is by default broadcasted and 334 | applied on all evaluations of `f`. 335 | 336 | `dim` is a tuple of integers specifying the dimension over which the elements 337 | of `D` is slices. The length of the tuple must therefore be the same as the 338 | number of arguments `D`. By default distributed arrays are slides along the 339 | last dimension. If the value is less than or equal to zero the element are 340 | broadcasted to all evaluations of `f`. 341 | 342 | #### Result 343 | `ppeval` returns a distributed array of dimension `p+1` where the first `p` 344 | sizes correspond to the sizes of return values of `f`. The last dimention of 345 | the return array from `ppeval` has the same length as the dimension over which 346 | the input arrays are sliced. 347 | 348 | #### Examples 349 | ```jl 350 | addprocs(JULIA_CPU_CORES) 351 | 352 | using DistributedArrays 353 | 354 | A = drandn((10, 10, JULIA_CPU_CORES), workers(), [1, 1, JULIA_CPU_CORES]) 355 | 356 | ppeval(eigvals, A) 357 | 358 | ppeval(eigvals, A, randn(10,10)) # broadcasting second argument 359 | 360 | B = drandn((10, JULIA_CPU_CORES), workers(), [1, JULIA_CPU_CORES]) 361 | 362 | ppeval(*, A, B) 363 | ``` 364 | """ 365 | function ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D)) 366 | #Ensure that the complete DArray is available on the specified dims on all processors 367 | for i = 1:length(D) 368 | if isa(D[i], DArray) 369 | for idxs in D[i].indexes 370 | for d in setdiff(1:ndims(D[i]), dim[i]) 371 | if length(idxs[d]) != size(D[i], d) 372 | throw(DimensionMismatch(string("dimension $d is distributed. ", 373 | "ppeval requires dimension $d to be completely available on all processors."))) 374 | end 375 | end 376 | end 377 | end 378 | end 379 | 380 | refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])] 381 | 382 | # The array of Futures has to be reshaped for the DArray constructor to work correctly. 383 | # This requires a fetch and the DArray is also fetching so it might be better to modify 384 | # the DArray constructor. 385 | sd = [size(D[1].pids)...] 386 | nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1]) 387 | DArray(reshape(refs, tuple([sd[1:nd - 1], sd[end];]...))) 388 | end 389 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DistributedArrays.jl 2 | 3 | [![Build Status](https://travis-ci.org/JuliaParallel/DistributedArrays.jl.svg?branch=master)](https://travis-ci.org/JuliaParallel/DistributedArrays.jl) 4 | [![Coverage Status](https://coveralls.io/repos/github/JuliaParallel/DistributedArrays.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaParallel/DistributedArrays.jl?branch=master) 5 | [![codecov](https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaParallel/DistributedArrays.jl) 6 | 7 | Distributed Arrays for Julia 8 | 9 | ***NOTE*** 10 | Distributed Arrays will only work on Julia v0.4.0 or later. 
11 | 12 | `DArray`s have been removed from the Julia Base library in v0.4 so it is now necessary to import the `DistributedArrays` package on all spawned processes. 13 | 14 | ```julia 15 | @everywhere using DistributedArrays 16 | ``` 17 | 18 | Distributed Arrays 19 | ------------------ 20 | 21 | Large computations are often organized around large arrays of data. In 22 | these cases, a particularly natural way to obtain parallelism is to 23 | distribute arrays among several processes. This combines the memory 24 | resources of multiple machines, allowing use of arrays too large to fit 25 | on one machine. Each process operates on the part of the array it 26 | owns, providing a ready answer to the question of how a program should 27 | be divided among machines. 28 | 29 | Julia distributed arrays are implemented by the `DArray` type. A 30 | `DArray` has an element type and dimensions just like an `Array`. 31 | A `DArray` can also use arbitrary array-like types to represent the local 32 | chunks that store actual data. The data in a `DArray` is distributed by 33 | dividing the index space into some number of blocks in each dimension. 34 | 35 | Common kinds of arrays can be constructed with functions beginning with 36 | `d`: 37 | 38 | ```julia 39 | dzeros(100,100,10) 40 | dones(100,100,10) 41 | drand(100,100,10) 42 | drandn(100,100,10) 43 | dfill(x,100,100,10) 44 | ``` 45 | 46 | In the last case, each element will be initialized to the specified 47 | value `x`. These functions automatically pick a distribution for you. 48 | For more control, you can specify which processes to use, and how the 49 | data should be distributed: 50 | 51 | ```julia 52 | dzeros((100,100), workers()[1:4], [1,4]) 53 | ``` 54 | 55 | The second argument specifies that the array should be created on the first 56 | four workers. When dividing data among a large number of processes, 57 | one often sees diminishing returns in performance. Placing `DArray`s 58 | on a subset of processes allows multiple `DArray` computations to 59 | happen at once, with a higher ratio of work to communication on each 60 | process. 61 | 62 | The third argument specifies a distribution; the nth element of 63 | this array specifies how many pieces dimension n should be divided into. 64 | In this example the first dimension will not be divided, and the second 65 | dimension will be divided into 4 pieces. Therefore each local chunk will be 66 | of size `(100,25)`. Note that the product of the distribution array must 67 | equal the number of processes. 68 | 69 | * `distribute(a::Array)` converts a local array to a distributed array. 70 | 71 | * `localpart(d::DArray)` obtains the locally-stored portion 72 | of a `DArray`. 73 | 74 | * Localparts can be retrieved and set via the indexing syntax too. 75 | Indexing via symbols is used for this, specifically symbols `:L`,`:LP`,`:l`,`:lp` which 76 | are all equivalent. For example, `d[:L]` returns the localpart of `d` 77 | while `d[:L]=v` sets `v` as the localpart of `d`. 78 | 79 | * `localindexes(a::DArray)` gives a tuple of the index ranges owned by the 80 | local process. 81 | 82 | * `convert(Array, a::DArray)` brings all the data to the local process. 83 | 84 | Indexing a `DArray` (square brackets) with ranges of indexes always 85 | creates a `SubArray`, not copying any data.
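For example, a minimal session using these primitives might look like the following sketch (the array size is arbitrary, and the exact distribution depends on how many workers have been added):

```julia
addprocs(4)
@everywhere using DistributedArrays

a = rand(8, 8)
d = distribute(a)        # distribute the local array over the workers

localindexes(d)          # tuple of index ranges owned by the calling process
localpart(d)             # the locally stored chunk; `d[:L]` is an equivalent way to access it
                         # (typically empty on the master, since the data lives on the workers)
convert(Array, d) == a   # brings all the data back to the calling process
```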
86 | 87 | 88 | Constructing Distributed Arrays 89 | ------------------------------- 90 | 91 | The primitive `DArray` constructor has the following somewhat elaborate signature: 92 | 93 | ```julia 94 | DArray(init, dims[, procs, dist]) 95 | ``` 96 | 97 | `init` is a function that accepts a tuple of index ranges. This function should 98 | allocate a local chunk of the distributed array and initialize it for the specified 99 | indices. `dims` is the overall size of the distributed array. 100 | `procs` optionally specifies a vector of process IDs to use. 101 | `dist` is an integer vector specifying how many chunks the 102 | distributed array should be divided into in each dimension. 103 | 104 | The last two arguments are optional, and defaults will be used if they 105 | are omitted. 106 | 107 | As an example, here is how to turn the local array constructor `fill` 108 | into a distributed array constructor: 109 | 110 | ```julia 111 | dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) 112 | ``` 113 | 114 | In this case the `init` function only needs to call `fill` with the 115 | dimensions of the local piece it is creating. 116 | 117 | `DArray`s can also be constructed from multidimensional `Array` comprehensions with 118 | the `@DArray` macro syntax. This syntax is just sugar for the primitive `DArray` constructor: 119 | 120 | ```julia 121 | julia> [i+j for i = 1:5, j = 1:5] 122 | 5x5 Array{Int64,2}: 123 | 2 3 4 5 6 124 | 3 4 5 6 7 125 | 4 5 6 7 8 126 | 5 6 7 8 9 127 | 6 7 8 9 10 128 | 129 | julia> @DArray [i+j for i = 1:5, j = 1:5] 130 | 5x5 DistributedArrays.DArray{Int64,2,Array{Int64,2}}: 131 | 2 3 4 5 6 132 | 3 4 5 6 7 133 | 4 5 6 7 8 134 | 5 6 7 8 9 135 | 6 7 8 9 10 136 | ``` 137 | 138 | Distributed Array Operations 139 | ---------------------------- 140 | 141 | At this time, distributed arrays do not have much functionality. Their 142 | major utility is allowing communication to be done via array indexing, which 143 | is convenient for many problems. As an example, consider implementing the 144 | "life" cellular automaton, where each cell in a grid is updated according 145 | to its neighboring cells. To compute a chunk of the result of one iteration, 146 | each process needs the immediate neighbor cells of its local chunk. The 147 | following code accomplishes this:: 148 | 149 | ```julia 150 | function life_step(d::DArray) 151 | DArray(size(d),procs(d)) do I 152 | top = mod(first(I[1])-2,size(d,1))+1 153 | bot = mod( last(I[1]) ,size(d,1))+1 154 | left = mod(first(I[2])-2,size(d,2))+1 155 | right = mod( last(I[2]) ,size(d,2))+1 156 | 157 | old = Array(Bool, length(I[1])+2, length(I[2])+2) 158 | old[1 , 1 ] = d[top , left] # left side 159 | old[2:end-1, 1 ] = d[I[1], left] 160 | old[end , 1 ] = d[bot , left] 161 | old[1 , 2:end-1] = d[top , I[2]] 162 | old[2:end-1, 2:end-1] = d[I[1], I[2]] # middle 163 | old[end , 2:end-1] = d[bot , I[2]] 164 | old[1 , end ] = d[top , right] # right side 165 | old[2:end-1, end ] = d[I[1], right] 166 | old[end , end ] = d[bot , right] 167 | 168 | life_rule(old) 169 | end 170 | end 171 | ``` 172 | 173 | As you can see, we use a series of indexing expressions to fetch 174 | data into a local array `old`. Note that the `do` block syntax is 175 | convenient for passing `init` functions to the `DArray` constructor. 176 | Next, the serial function `life_rule` is called to apply the update rules 177 | to the data, yielding the needed `DArray` chunk. 
Nothing about `life_rule` 178 | is `DArray`-specific, but we list it here for completeness: 179 | 180 | ```julia 181 | function life_rule(old) 182 | m, n = size(old) 183 | new = similar(old, m-2, n-2) 184 | for j = 2:n-1 185 | for i = 2:m-1 186 | nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1], 187 | old[i ,j-1], old[i ,j+1], 188 | old[i+1,j-1], old[i+1,j], old[i+1,j+1]) 189 | new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j]) 190 | end 191 | end 192 | new 193 | end 194 | ``` 195 | 196 | Numerical Results of Distributed Computations 197 | --------------------------------------------- 198 | 199 | Floating point arithmetic is not associative and this comes up 200 | when performing distributed computations over `DArray`s. All `DArray` 201 | operations are performed over the `localpart` chunks and then aggregated. 202 | The change in ordering of the operations will change the numeric result as 203 | seen in this simple example: 204 | 205 | ```julia 206 | julia> addprocs(8); 207 | 208 | julia> @everywhere using DistributedArrays 209 | 210 | julia> A = fill(1.1, (100,100)); 211 | 212 | julia> sum(A) 213 | 11000.000000000013 214 | 215 | julia> DA = distribute(A); 216 | 217 | julia> sum(DA) 218 | 11000.000000000127 219 | 220 | julia> sum(A) == sum(DA) 221 | false 222 | ``` 223 | 224 | The ultimate ordering of operations will be dependent on how the Array is distributed. 225 | 226 | Garbage Collection and DArrays 227 | ------------------------------ 228 | 229 | When a DArray is constructed (typically on the master process), the returned DArray object stores information on how the 230 | array is distributed, which processor holds which indexes, and so on. When the DArray object 231 | on the master process is garbage collected, all participating workers are notified and 232 | the localparts of the DArray are freed on each worker. 233 | 234 | Since the size of the DArray object itself is small, a problem arises as `gc` on the master faces no memory pressure to 235 | collect the DArray immediately. This results in a delay of the memory being released on the participating workers. 236 | 237 | Therefore it is highly recommended to explicitly call `close(d::DArray)` as soon as user code 238 | has finished working with the distributed array. 239 | 240 | It is also important to note that the localparts of the DArray are collected from all participating workers 241 | when the DArray object on the process creating the DArray is collected. It is therefore important to maintain 242 | a reference to a DArray object on the creating process for as long as it is being computed upon. 243 | 244 | `d_closeall()` is another useful function to manage distributed memory. It releases all darrays created from 245 | the calling process, including any temporaries created during computation. 246 | 247 | Working with distributed non-array data 248 | --------------------------------------- 249 | 250 | The function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used 251 | to create a distributed vector whose localparts need not be Arrays. 252 | 253 | It returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same. 254 | 255 | `ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value, 256 | initialized to `nothing`. 257 | 258 | Argument `data`, if supplied, is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`.
259 | If the multiple is 1, returns a `DArray{T,1,T}` where `T` is `eltype(data)`. If the multiple is greater than 1, 260 | returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`. 261 | 262 | `gather{T}(d::DArray{T,1,T})` returns an `Array{T,1}` consisting of all the distributed elements of `d`. 263 | 264 | Given a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the `localpart` 265 | on the ith worker that `d` is distributed over. 266 | 267 | SPMD Mode (An MPI Style SPMD mode with MPI like primitives) 268 | ------------------------------------------------------------ 269 | SPMD, i.e., a Single Program Multiple Data mode, is implemented by the submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes. This is a typical style of MPI programs where the same program is executed on all processors. A basic subset of MPI-like primitives is currently supported. As a programming model it should be familiar to folks with an MPI background. 270 | 271 | The same block of code is executed concurrently on all workers using the `spmd` function. 272 | 273 | ``` 274 | # define foo() on all workers 275 | @everywhere function foo(arg1, arg2) 276 | .... 277 | end 278 | 279 | # call foo() everywhere using the `spmd` function 280 | d_in=DArray(.....) 281 | d_out=ddata() 282 | spmd(foo,d_in,d_out; pids=workers()) # executes on all workers 283 | ``` 284 | 285 | `spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`. 286 | 287 | `args` is one or more arguments to be passed to `f`. `pids` identifies the workers 288 | that `f` needs to be run on. `context` identifies a run context, which is explained 289 | later. 290 | 291 | The following primitives can be used in SPMD mode. 292 | 293 | - `sendto(pid, data; tag=nothing)` - sends `data` to `pid` 294 | 295 | - `recvfrom(pid; tag=nothing)` - receives data from `pid` 296 | 297 | - `recvfrom_any(; tag=nothing)` - receives data from any `pid` 298 | 299 | - `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed 300 | 301 | - `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid` 302 | 303 | - `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid` 304 | 305 | - `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid` 306 | 307 | Tag `tag` should be used to differentiate between consecutive calls of the same type, for example, 308 | consecutive `bcast` calls. 309 | 310 | `spmd` and SPMD-related functions are defined in the submodule `DistributedArrays.SPMD`. You will need to 311 | import it explicitly, or prefix functions that can only be used in SPMD mode with `SPMD.`, for example, 312 | `SPMD.sendto`. 313 | 314 | Example 315 | ------- 316 | 317 | This toy example exchanges data with each of its neighbors `n` times. 318 | 319 | ``` 320 | using DistributedArrays 321 | addprocs(8) 322 | @everywhere importall DistributedArrays 323 | @everywhere importall DistributedArrays.SPMD 324 | 325 | d_in=d=DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]) 326 | d_out=ddata() 327 | 328 | # define the function everywhere 329 | @everywhere function foo_spmd(d_in, d_out, n) 330 | pids = sort(vec(procs(d_in))) 331 | pididx = findfirst(pids, myid()) 332 | mylp = d_in[:L] 333 | localsum = 0 334 | 335 | # Have each worker exchange data with its neighbors 336 | n_pididx = pididx+1 > length(pids) ?
1 : pididx+1 337 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 338 | 339 | for i in 1:n 340 | sendto(pids[n_pididx], mylp[2]) 341 | sendto(pids[p_pididx], mylp[1]) 342 | 343 | mylp[2] = recvfrom(pids[p_pididx]) 344 | mylp[1] = recvfrom(pids[n_pididx]) 345 | 346 | barrier(;pids=pids) 347 | localsum = localsum + mylp[1] + mylp[2] 348 | end 349 | 350 | # finally store the sum in d_out 351 | d_out[:L] = localsum 352 | end 353 | 354 | # run foo_spmd on all workers 355 | spmd(foo_spmd, d_in, d_out, 10) 356 | 357 | # print values of d_in and d_out after the run 358 | println(d_in) 359 | println(d_out) 360 | ``` 361 | 362 | SPMD Context 363 | ------------ 364 | 365 | Each SPMD run is implicitly executed in a different context. This allows for multiple `spmd` calls to 366 | be active at the same time. An SPMD context can be explicitly specified via the keyword argument `context` to `spmd`. 367 | 368 | `context(pids=procs())` returns a new SPMD context. 369 | 370 | An SPMD context also provides context local storage, a dict, which can be used to store 371 | key-value pairs between spmd runs under the same context. 372 | 373 | `context_local_storage()` returns the dictionary associated with the context. 374 | 375 | NOTE: Implicitly defined contexts, i.e., `spmd` calls without specifying a `context`, create a context 376 | which lives only for the duration of the call. Explicitly created context objects can be released 377 | early by calling `close(ctxt::SPMDContext)`. This will release the local storage dictionaries 378 | on all participating `pids`. Otherwise they will be released when the context object is gc'ed 379 | on the node that created it. 380 | 381 | 382 | Nested `spmd` calls 383 | ------------------- 384 | As `spmd` executes the specified function on all participating nodes, we need to be careful with nesting `spmd` calls. 385 | 386 | An example of an unsafe (wrong) way: 387 | ``` 388 | function foo(.....) 389 | ...... 390 | spmd(bar, ......) 391 | ...... 392 | end 393 | 394 | function bar(....) 395 | ...... 396 | spmd(baz, ......) 397 | ...... 398 | end 399 | 400 | spmd(foo,....) 401 | ``` 402 | In the above example, `foo`, `bar` and `baz` are all functions wishing to leverage distributed computation. However, they themselves may currently be part of an `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process. 403 | 404 | The correct way (only have the driver process initiate `spmd` calls): 405 | ``` 406 | function foo() 407 | ...... 408 | myid()==1 && spmd(bar, ......) 409 | ...... 410 | end 411 | 412 | function bar() 413 | ...... 414 | myid()==1 && spmd(baz, ......) 415 | ...... 416 | end 417 | 418 | spmd(foo,....) 419 | ``` 420 | 421 | This is also true of functions which automatically distribute computation on DArrays. 422 | ``` 423 | function foo(d::DArray) 424 | ...... 425 | myid()==1 && map!(bar, d) 426 | ...... 427 | end 428 | spmd(foo,....) 429 | ``` 430 | Without the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want. 431 | 432 | Similarly, `@everywhere` from within an SPMD run should also be driven from the master node only. 433 | -------------------------------------------------------------------------------- /src/darray.jl: -------------------------------------------------------------------------------- 1 | """ 2 | DArray(init, dims, [procs, dist]) 3 | 4 | Construct a distributed array. 5 | 6 | The parameter `init` is a function that accepts a tuple of index ranges.
7 | This function should allocate a local chunk of the distributed array and initialize it for the specified indices. 8 | 9 | `dims` is the overall size of the distributed array. 10 | 11 | `procs` optionally specifies a vector of process IDs to use. 12 | If unspecified, the array is distributed over all worker processes only. Typically, when running in distributed mode, 13 | i.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the 14 | interactive julia prompt. 15 | 16 | `dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension. 17 | 18 | For example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as: 19 | 20 | ### Example 21 | ```jl 22 | dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) 23 | ``` 24 | """ 25 | type DArray{T,N,A} <: AbstractArray{T,N} 26 | id::Tuple 27 | dims::NTuple{N,Int} 28 | pids::Array{Int,N} # pids[i]==p ⇒ processor p has piece i 29 | indexes::Array{NTuple{N,UnitRange{Int}},N} # indexes held by piece i 30 | cuts::Vector{Vector{Int}} # cuts[d][i] = first index of chunk i in dimension d 31 | localpart::Nullable{A} 32 | 33 | release::Bool 34 | 35 | function DArray(id, dims, pids, indexes, cuts, lp) 36 | # check invariants 37 | if dims != map(last, last(indexes)) 38 | throw(ArgumentError("dimension of DArray (dim) and indexes do not match")) 39 | end 40 | release = (myid() == id[1]) 41 | 42 | haskey(registry, id) && return registry[id] 43 | 44 | d = new(id, dims, pids, indexes, cuts, lp, release) 45 | if release 46 | push!(refs, id) 47 | registry[id] = d 48 | 49 | # println("Installing finalizer for : ", d.id, ", : ", object_id(d), ", isbits: ", isbits(d)) 50 | finalizer(d, close) 51 | end 52 | d 53 | end 54 | 55 | DArray() = new() 56 | end 57 | 58 | eltype{T}(::Type{DArray{T}}) = T 59 | empty_localpart(T,N,A) = convert(A, Array{T}(ntuple(zero, N))) 60 | 61 | typealias SubDArray{T,N,D<:DArray} SubArray{T,N,D} 62 | typealias SubOrDArray{T,N} Union{DArray{T,N}, SubDArray{T,N}} 63 | 64 | localtype{T,N,S}(::Type{DArray{T,N,S}}) = S 65 | localtype{T,N,D}(::Type{SubDArray{T,N,D}}) = localtype(D) 66 | localtype(A::SubOrDArray) = localtype(typeof(A)) 67 | localtype(A::AbstractArray) = typeof(A) 68 | 69 | ## core constructors ## 70 | 71 | function DArray(id, init, dims, pids, idxs, cuts) 72 | r=Channel(1) 73 | @sync begin 74 | for i = 1:length(pids) 75 | @async begin 76 | local typA 77 | if isa(init, Function) 78 | typA=remotecall_fetch(construct_localparts, pids[i], init, id, dims, pids, idxs, cuts) 79 | else 80 | # constructing from an array of remote refs. 81 | typA=remotecall_fetch(construct_localparts, pids[i], init[i], id, dims, pids, idxs, cuts) 82 | end 83 | !isready(r) && put!(r, typA) 84 | end 85 | end 86 | end 87 | 88 | A = take!(r) 89 | if myid() in pids 90 | d = registry[id] 91 | else 92 | T = eltype(A) 93 | N = length(dims) 94 | d = DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A)) 95 | end 96 | d 97 | end 98 | 99 | function construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing) 100 | localpart = isa(init, Function) ? 
init(idxs[localpartindex(pids)]) : fetch(init) 101 | if A == nothing 102 | A = typeof(localpart) 103 | end 104 | if T == nothing 105 | T = eltype(A) 106 | end 107 | N = length(dims) 108 | d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart) 109 | registry[id] = d 110 | A 111 | end 112 | 113 | function ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[]) 114 | pids=sort(vec(pids)) 115 | id = next_did() 116 | npids = length(pids) 117 | ldata = length(data) 118 | idxs, cuts = chunk_idxs([npids], [npids]) 119 | 120 | if ldata > 0 121 | @assert rem(ldata,npids) == 0 122 | if ldata == npids 123 | T = eltype(data) 124 | s = DestinationSerializer(pididx->data[pididx], pids) 125 | init = I->localpart(s) 126 | else 127 | # call the standard distribute function 128 | return distribute(data) 129 | end 130 | end 131 | 132 | @sync for i = 1:length(pids) 133 | @async remotecall_fetch(construct_localparts, pids[i], init, id, (npids,), pids, idxs, cuts; T=T, A=T) 134 | end 135 | 136 | if myid() in pids 137 | d = registry[id] 138 | else 139 | d = DArray{T,1,T}(id, (npids,), pids, idxs, cuts, Nullable{T}()) 140 | end 141 | d 142 | end 143 | 144 | function gather{T}(d::DArray{T,1,T}) 145 | a=Array{T}(length(procs(d))) 146 | @sync for (i,p) in enumerate(procs(d)) 147 | @async a[i] = remotecall_fetch(localpart, p, d) 148 | end 149 | a 150 | end 151 | 152 | function DArray(init, dims, procs, dist) 153 | np = prod(dist) 154 | procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist))) 155 | idxs, cuts = chunk_idxs([dims...], dist) 156 | id = next_did() 157 | 158 | return DArray(id, init, dims, procs, idxs, cuts) 159 | end 160 | 161 | function DArray(init, dims, procs) 162 | if isempty(procs) 163 | throw(ArgumentError("no processors given")) 164 | end 165 | return DArray(init, dims, procs, defaultdist(dims, procs)) 166 | end 167 | DArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))]) 168 | 169 | # Create a DArray from a collection of references 170 | # The refs must have the same layout as the parts distributed. 171 | # i.e. 172 | # size(refs) must specify the distribution of dimensions across processors 173 | # prod(size(refs)) must equal number of parts 174 | # FIXME : Empty parts are currently not supported. 175 | function DArray(refs) 176 | dimdist = size(refs) 177 | id = next_did() 178 | 179 | npids = [r.where for r in refs] 180 | nsizes = Array{Tuple}(dimdist) 181 | @sync for i in 1:length(refs) 182 | let i=i 183 | @async nsizes[i] = remotecall_fetch(sz_localpart_ref, npids[i], refs[i], id) 184 | end 185 | end 186 | 187 | nindexes = Array{NTuple{length(dimdist),UnitRange{Int}}}(dimdist...) 188 | 189 | for i in 1:length(nindexes) 190 | subidx = ind2sub(dimdist, i) 191 | nindexes[i] = ntuple(length(subidx)) do x 192 | idx_in_dim = subidx[x] 193 | startidx = 1 194 | for j in 1:(idx_in_dim-1) 195 | prevsubidx = ntuple(y -> y == x ? j : subidx[y], length(subidx)) 196 | prevsize = nsizes[prevsubidx...] 197 | startidx += prevsize[x] 198 | end 199 | startidx:startidx+(nsizes[i][x])-1 200 | end 201 | end 202 | 203 | lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindexes]...) 204 | ncuts = Array{Int,1}[unshift!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)] 205 | ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...) 
206 | 207 | DArray(id, refs, ndims, reshape(npids, dimdist), nindexes, ncuts) 208 | end 209 | 210 | macro DArray(ex0::Expr) 211 | if ex0.head !== :comprehension 212 | throw(ArgumentError("invalid @DArray syntax")) 213 | end 214 | ex = ex0.args[1] 215 | if ex.head !== :generator 216 | throw(ArgumentError("invalid @DArray syntax")) 217 | end 218 | ex.args[1] = esc(ex.args[1]) 219 | ndim = length(ex.args) - 1 220 | ranges = map(r->esc(r.args[2]), ex.args[2:end]) 221 | for d = 1:ndim 222 | var = ex.args[d+1].args[1] 223 | ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] ) 224 | end 225 | return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0), 226 | tuple($(map(r->:(length($r)), ranges)...))) ) 227 | end 228 | 229 | # new DArray similar to an existing one 230 | DArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indexes, d.cuts) 231 | 232 | sz_localpart_ref(ref, id) = size(fetch(ref)) 233 | 234 | Base.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(map(length,I)), dims, procs(d)) 235 | Base.similar(d::DArray, T::Type) = similar(d, T, size(d)) 236 | Base.similar{T}(d::DArray{T}, dims::Dims) = similar(d, T, dims) 237 | Base.similar{T}(d::DArray{T}) = similar(d, T, size(d)) 238 | 239 | Base.size(d::DArray) = d.dims 240 | 241 | chunktype{T,N,A}(d::DArray{T,N,A}) = A 242 | 243 | ## chunk index utilities ## 244 | 245 | # decide how to divide each dimension 246 | # returns size of chunks array 247 | function defaultdist(dims, pids) 248 | dims = [dims...] 249 | chunks = ones(Int, length(dims)) 250 | np = length(pids) 251 | f = sort!(collect(keys(factor(np))), rev=true) 252 | k = 1 253 | while np > 1 254 | # repeatedly allocate largest factor to largest dim 255 | if np % f[k] != 0 256 | k += 1 257 | if k > length(f) 258 | break 259 | end 260 | end 261 | fac = f[k] 262 | (d, dno) = findmax(dims) 263 | # resolve ties to highest dim 264 | dno = last(find(dims .== d)) 265 | if dims[dno] >= fac 266 | dims[dno] = div(dims[dno], fac) 267 | chunks[dno] *= fac 268 | end 269 | np = div(np, fac) 270 | end 271 | return chunks 272 | end 273 | 274 | # get array of start indexes for dividing sz into nc chunks 275 | function defaultdist(sz::Int, nc::Int) 276 | if sz >= nc 277 | return round.(Int, linspace(1, sz+1, nc+1)) 278 | else 279 | return [[1:(sz+1);], zeros(Int, nc-sz);] 280 | end 281 | end 282 | 283 | # compute indexes array for dividing dims into chunks 284 | function chunk_idxs(dims, chunks) 285 | cuts = map(defaultdist, dims, chunks) 286 | n = length(dims) 287 | idxs = Array{NTuple{n,UnitRange{Int}}}(chunks...) 288 | for cidx in CartesianRange(tuple(chunks...)) 289 | idxs[cidx.I...] = ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n) 290 | end 291 | return (idxs, cuts) 292 | end 293 | 294 | function localpartindex(pids::Array{Int}) 295 | mi = myid() 296 | for i = 1:length(pids) 297 | if pids[i] == mi 298 | return i 299 | end 300 | end 301 | return 0 302 | end 303 | localpartindex(d::DArray) = localpartindex(procs(d)) 304 | 305 | """ 306 | localpart(d::DArray) 307 | 308 | Get the local piece of a distributed array. 309 | Returns an empty array if no local part exists on the calling process. 310 | 311 | d[:L], d[:l], d[:LP], d[:lp] are an alternative means to get localparts. 312 | This syntaxt can also be used for assignment. For example, 313 | `d[:L]=v` will assign `v` to the localpart of `d`. 
314 | """ 315 | function localpart{T,N,A}(d::DArray{T,N,A}) 316 | lpidx = localpartindex(d) 317 | if lpidx == 0 318 | return empty_localpart(T,N,A)::A 319 | end 320 | 321 | return get(registry[d.id].localpart)::A 322 | end 323 | 324 | localpart(d::DArray, localidx...) = localpart(d)[localidx...] 325 | 326 | # shortcut to set/get localparts of a distributed object 327 | function Base.getindex(d::DArray, s::Symbol) 328 | @assert s in [:L, :l, :LP, :lp] 329 | return localpart(d) 330 | end 331 | 332 | function Base.setindex!{T,N,A}(d::DArray{T,N,A}, new_lp::A, s::Symbol) 333 | @assert s in [:L, :l, :LP, :lp] 334 | d.localpart = new_lp 335 | new_lp 336 | end 337 | 338 | 339 | # fetch localpart of d at pids[i] 340 | fetch{T,N,A}(d::DArray{T,N,A}, i) = remotecall_fetch(localpart, d.pids[i], d) 341 | 342 | """ 343 | localindexes(d) 344 | 345 | A tuple describing the indexes owned by the local process. 346 | Returns a tuple with empty ranges if no local part exists on the calling process. 347 | """ 348 | function localindexes(d::DArray) 349 | lpidx = localpartindex(d) 350 | if lpidx == 0 351 | return ntuple(i -> 1:0, ndims(d)) 352 | end 353 | return d.indexes[lpidx] 354 | end 355 | 356 | # find which piece holds index (I...) 357 | locate(d::DArray, I::Int...) = 358 | ntuple(i -> searchsortedlast(d.cuts[i], I[i]), ndims(d)) 359 | 360 | chunk{T,N,A}(d::DArray{T,N,A}, i...) = remotecall_fetch(localpart, d.pids[i...], d)::A 361 | 362 | ## convenience constructors ## 363 | 364 | """ 365 | dzeros(dims, ...) 366 | 367 | Construct a distributed array of zeros. 368 | Trailing arguments are the same as those accepted by `DArray`. 369 | """ 370 | dzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...) 371 | dzeros{T}(::Type{T}, dims::Dims, args...) = DArray(I->zeros(T,map(length,I)), dims, args...) 372 | dzeros{T}(::Type{T}, d1::Integer, drest::Integer...) = dzeros(T, convert(Dims, tuple(d1, drest...))) 373 | dzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...))) 374 | dzeros(d::Dims) = dzeros(Float64, d) 375 | 376 | 377 | """ 378 | dones(dims, ...) 379 | 380 | Construct a distributed array of ones. 381 | Trailing arguments are the same as those accepted by `DArray`. 382 | """ 383 | dones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...) 384 | dones{T}(::Type{T}, dims::Dims, args...) = DArray(I->ones(T,map(length,I)), dims, args...) 385 | dones{T}(::Type{T}, d1::Integer, drest::Integer...) = dones(T, convert(Dims, tuple(d1, drest...))) 386 | dones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...))) 387 | dones(d::Dims) = dones(Float64, d) 388 | 389 | """ 390 | dfill(x, dims, ...) 391 | 392 | Construct a distributed array filled with value `x`. 393 | Trailing arguments are the same as those accepted by `DArray`. 394 | """ 395 | dfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...) 396 | dfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...))) 397 | 398 | """ 399 | drand(dims, ...) 400 | 401 | Construct a distributed uniform random array. 402 | Trailing arguments are the same as those accepted by `DArray`. 403 | """ 404 | drand(r, dims::Dims, args...) = DArray(I -> rand(r, map(length,I)), dims, args...) 405 | drand(r, d1::Integer, drest::Integer...) = drand(r, convert(Dims, tuple(d1, drest...))) 406 | drand(d1::Integer, drest::Integer...) = drand(Float64, convert(Dims, tuple(d1, drest...))) 407 | drand(d::Dims, args...) 
= drand(Float64, d, args...) 408 | 409 | """ 410 | drandn(dims, ...) 411 | 412 | Construct a distributed normal random array. 413 | Trailing arguments are the same as those accepted by `DArray`. 414 | """ 415 | drandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...) 416 | drandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...))) 417 | 418 | ## conversions ## 419 | 420 | """ 421 | distribute(A[; procs, dist]) 422 | 423 | Convert a local array to distributed. 424 | 425 | `procs` optionally specifies an array of process IDs to use. (defaults to all workers) 426 | `dist` optionally specifies a vector or tuple of the number of partitions in each dimension 427 | """ 428 | function distribute(A::AbstractArray; 429 | procs = workers()[1:min(nworkers(), maximum(size(A)))], 430 | dist = defaultdist(size(A), procs)) 431 | np = prod(dist) 432 | procs_used = procs[1:np] 433 | idxs, _ = chunk_idxs([size(A)...], dist) 434 | 435 | s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx 436 | A[idxs[pididx]...] 437 | end 438 | return DArray(I->localpart(s), size(A), procs_used, dist) 439 | end 440 | 441 | """ 442 | distribute(A, DA) 443 | 444 | Distribute a local array `A` like the distributed array `DA`. 445 | 446 | """ 447 | function distribute(A::AbstractArray, DA::DArray) 448 | size(DA) == size(A) || throw(DimensionMismatch("Distributed array has size $(size(DA)) but array has $(size(A))")) 449 | 450 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 451 | A[DA.indexes[pididx]...] 452 | end 453 | return DArray(I->localpart(s), DA) 454 | end 455 | 456 | Base.convert{T,N,S<:AbstractArray}(::Type{DArray{T,N,S}}, A::S) = distribute(convert(AbstractArray{T,N}, A)) 457 | 458 | Base.convert{S,T,N}(::Type{Array{S,N}}, d::DArray{T,N}) = begin 459 | a = Array{S}(size(d)) 460 | @sync begin 461 | for i = 1:length(d.pids) 462 | @async a[d.indexes[i]...] = chunk(d, i) 463 | end 464 | end 465 | return a 466 | end 467 | 468 | Base.convert{S,T,N}(::Type{Array{S,N}}, s::SubDArray{T,N}) = begin 469 | I = s.indexes 470 | d = s.parent 471 | if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S 472 | l = locate(d, map(first, I)...) 473 | if isequal(d.indexes[l...], I) 474 | # SubDArray corresponds to a chunk 475 | return chunk(d, l...) 476 | end 477 | end 478 | a = Array{S}(size(s)) 479 | a[[1:size(a,i) for i=1:N]...] = s 480 | return a 481 | end 482 | 483 | function Base.convert{T,N}(::Type{DArray}, SD::SubArray{T,N}) 484 | D = SD.parent 485 | DArray(size(SD), procs(D)) do I 486 | TR = typeof(SD.indexes[1]) 487 | lindices = Array{TR}(0) 488 | for (i,r) in zip(I, SD.indexes) 489 | st = step(r) 490 | lrstart = first(r) + st*(first(i)-1) 491 | lrend = first(r) + st*(last(i)-1) 492 | if TR <: UnitRange 493 | push!(lindices, lrstart:lrend) 494 | else 495 | push!(lindices, lrstart:st:lrend) 496 | end 497 | end 498 | convert(Array, D[lindices...]) 499 | end 500 | end 501 | 502 | Base.reshape{T,S<:Array}(A::DArray{T,1,S}, d::Dims) = begin 503 | if prod(d) != length(A) 504 | throw(DimensionMismatch("dimensions must be consistent with array size")) 505 | end 506 | return DArray(d) do I 507 | sz = map(length,I) 508 | d1offs = first(I[1]) 509 | nd = length(I) 510 | 511 | B = Array{T}(sz) 512 | nr = size(B,1) 513 | sztail = size(B)[2:end] 514 | 515 | for i=1:div(length(B),nr) 516 | i2 = ind2sub(sztail, i) 517 | globalidx = [ I[j][i2[j-1]] for j=2:nd ] 518 | 519 | a = sub2ind(d, d1offs, globalidx...) 
520 | 521 | B[:,i] = A[a:(a+nr-1)] 522 | end 523 | B 524 | end 525 | end 526 | 527 | ## indexing ## 528 | 529 | getlocalindex(d::DArray, idx...) = localpart(d)[idx...] 530 | function getindex_tuple{T}(d::DArray{T}, I::Tuple{Vararg{Int}}) 531 | chidx = locate(d, I...) 532 | idxs = d.indexes[chidx...] 533 | localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d)) 534 | pid = d.pids[chidx...] 535 | return remotecall_fetch(getlocalindex, pid, d, localidx...)::T 536 | end 537 | 538 | Base.getindex(d::DArray, i::Int) = getindex_tuple(d, ind2sub(size(d), i)) 539 | Base.getindex(d::DArray, i::Int...) = getindex_tuple(d, i) 540 | 541 | Base.getindex(d::DArray) = d[1] 542 | Base.getindex(d::DArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...) 543 | 544 | 545 | Base.copy!(dest::SubOrDArray, src::SubOrDArray) = begin 546 | asyncmap(procs(dest)) do p 547 | remotecall_fetch(p) do 548 | localpart(dest)[:] = src[localindexes(dest)...] 549 | end 550 | end 551 | return dest 552 | end 553 | 554 | # local copies are obtained by convert(Array, ) or assigning from 555 | # a SubDArray to a local Array. 556 | 557 | function Base.setindex!(a::Array, d::DArray, 558 | I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) 559 | n = length(I) 560 | @sync for i = 1:length(d.pids) 561 | K = d.indexes[i] 562 | @async a[[I[j][K[j]] for j=1:n]...] = chunk(d, i) 563 | end 564 | return a 565 | end 566 | 567 | # We also want to optimize setindex! with a SubDArray source, but this is hard 568 | # and only works on 0.5. 569 | 570 | # Similar to Base.indexin, but just create a logical mask. Note that this 571 | # must return a logical mask in order to support merging multiple masks 572 | # together into one linear index since we need to know how many elements to 573 | # skip at the end. In many cases range intersection would be much faster 574 | # than generating a logical mask, but that loses the endpoint information. 575 | indexin_mask(a, b::Number) = a .== b 576 | indexin_mask(a, r::Range{Int}) = [i in r for i in a] 577 | indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) 578 | indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) 579 | indexin_mask(a, b) = [i in b for i in a] 580 | 581 | import Base: tail 582 | # Given a tuple of indices and a tuple of masks, restrict the indices to the 583 | # valid regions. This is, effectively, reversing Base.setindex_shape_check. 584 | # We can't just use indexing into MergedIndices here because getindex is much 585 | # pickier about singleton dimensions than setindex! is. 586 | restrict_indices(::Tuple{}, ::Tuple{}) = () 587 | function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) 588 | if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) 589 | (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) 590 | elseif length(a[1]) == 1 591 | (a[1], restrict_indices(tail(a), b)) 592 | elseif length(b[1]) == 1 && b[1][1] 593 | restrict_indices(a, tail(b)) 594 | else 595 | throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) 596 | end 597 | end 598 | # The final indices are funky - they're allowed to accumulate together. 599 | # An easy (albeit very inefficient) fix for too many masks is to use the 600 | # outer product to merge them. 
But we can do that lazily with a custom type: 601 | function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) 602 | (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) 603 | end 604 | # But too many indices is much harder; this requires merging the indices 605 | # in `a` before applying the final mask in `b`. 606 | function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) 607 | if length(a[1]) == 1 608 | (a[1], restrict_indices(tail(a), b)) 609 | else 610 | # When one mask spans multiple indices, we need to merge the indices 611 | # together. At this point, we can just use indexing to merge them since 612 | # there's no longer special handling of singleton dimensions 613 | (view(MergedIndices(a, map(length, a)), b[1]),) 614 | end 615 | end 616 | 617 | immutable ProductIndices{I,N} <: AbstractArray{Bool, N} 618 | indices::I 619 | sz::NTuple{N,Int} 620 | end 621 | Base.size(P::ProductIndices) = P.sz 622 | # This gets passed to map to avoid breaking propagation of inbounds 623 | Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] 624 | Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = 625 | Bool((&)(map(propagate_getindex, P.indices, I)...)) 626 | 627 | immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} 628 | indices::I 629 | sz::NTuple{N,Int} 630 | end 631 | Base.size(M::MergedIndices) = M.sz 632 | Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = 633 | CartesianIndex(map(propagate_getindex, M.indices, I)) 634 | # Additionally, we optimize bounds checking when using MergedIndices as an 635 | # array index since checking, e.g., A[1:500, 1:500] is *way* faster than 636 | # checking an array of 500^2 elements of CartesianIndex{2}. This optimization 637 | # also applies to reshapes of MergedIndices since the outer shape of the 638 | # container doesn't affect the index elements themselves. We can go even 639 | # farther and say that even restricted views of MergedIndices must be valid 640 | # over the entire array. This is overly strict in general, but in this 641 | # use-case all the merged indices must be valid at some point, so it's ok. 642 | typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} 643 | typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} 644 | typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} 645 | import Base: checkbounds_indices 646 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 647 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 648 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 649 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 650 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 651 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 652 | 653 | # The tricky thing here is that we want to optimize the accesses into the 654 | # distributed array, but in doing so, we lose track of which indices in I we 655 | # should be using. 656 | # 657 | # I’ve come to the conclusion that the function is utterly insane. 658 | # There are *6* flavors of indices with four different reference points: 659 | # 1. Find the indices of each portion of the DArray. 660 | # 2. 
Find the valid subset of indices for the SubArray into that portion. 661 | # 3. Find the portion of the `I` indices that should be used when you access the 662 | # `K` indices in the subarray. This guy is nasty. It’s totally backwards 663 | # from all other arrays, wherein we simply iterate over the source array’s 664 | # elements. You need to *both* know which elements in `J` were skipped 665 | # (`indexin_mask`) and which dimensions should match up (`restrict_indices`) 666 | # 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of 667 | # the local portion of the source array 668 | function Base.setindex!(a::Array, s::SubDArray, 669 | I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) 670 | Inew = Base.to_indices(a, I) 671 | Base.setindex_shape_check(s, Base.index_lengths(Inew...)...) 672 | n = length(Inew) 673 | d = s.parent 674 | J = Base.to_indices(d, s.indexes) 675 | @sync for i = 1:length(d.pids) 676 | K_c = d.indexes[i] 677 | K = map(intersect, J, K_c) 678 | if !any(isempty, K) 679 | K_mask = map(indexin_mask, J, K_c) 680 | idxs = restrict_indices(Inew, K_mask) 681 | if isequal(K, K_c) 682 | # whole chunk 683 | @async a[idxs...] = chunk(d, i) 684 | else 685 | # partial chunk 686 | @async a[idxs...] = 687 | remotecall_fetch(d.pids[i]) do 688 | view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) 689 | end 690 | end 691 | end 692 | end 693 | return a 694 | end 695 | 696 | Base.fill!(A::DArray, x) = begin 697 | @sync for p in procs(A) 698 | @async remotecall_fetch((A,x)->(fill!(localpart(A), x); nothing), p, A, x) 699 | end 700 | return A 701 | end 702 | -------------------------------------------------------------------------------- /test/darray.jl: -------------------------------------------------------------------------------- 1 | t=@testset "test distribute" begin 2 | A = rand(1:100, (100,100)) 3 | 4 | @testset "test default distribute" begin 5 | DA = distribute(A) 6 | @test length(procs(DA)) == nworkers() 7 | @test sum(DA) == sum(A) 8 | close(DA) 9 | end 10 | 11 | @testset "test distribute with procs arguments" begin 12 | DA = distribute(A, procs = procs()) 13 | @test length(procs(DA)) == nprocs() 14 | @test sum(DA) == sum(A) 15 | close(DA) 16 | end 17 | 18 | @testset "test distribute with procs and dist arguments" begin 19 | DA = distribute(A, procs = [1, 2], dist = [1,2]) 20 | @test size(procs(DA)) == (1,2) 21 | @test sum(DA) == sum(A) 22 | close(DA) 23 | end 24 | 25 | @testset "Create darray with unconventional distribution and distibute like it" begin 26 | block = 10 27 | Y = nworkers() * block 28 | X = nworkers() * block 29 | remote_parts = map(workers()) do wid 30 | remotecall(rand, wid, block, Y) 31 | end 32 | DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1))) 33 | A = rand(X, Y) 34 | DA2 = distribute(A, DA1) 35 | 36 | @test size(DA1) == size(DA2) 37 | 38 | close(DA1) 39 | close(DA2) 40 | end 41 | end 42 | 43 | check_leaks(t) 44 | 45 | t=@testset "test DArray equality" begin 46 | D = drand((200,200), [MYID, OTHERIDS]) 47 | DC = copy(D) 48 | 49 | @testset "test isequal(::DArray, ::DArray)" begin 50 | @test D == DC 51 | end 52 | 53 | @testset "test copy(::DArray) does a copy of each localpart" begin 54 | @spawnat OTHERIDS localpart(DC)[1] = 0 55 | @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0) 56 | end 57 | 58 | close(D) 59 | close(DC) 60 | end 61 | 62 | check_leaks(t) 63 | 64 | t=@testset "test DArray similar" begin 65 | D = drand((200,200), [MYID, OTHERIDS]) 66 | DS = similar(D,Float16) 67 | 68 | @testset 
"test eltype of a similar" begin 69 | @test eltype(DS) == Float16 70 | end 71 | 72 | @testset "test dims of a similar" begin 73 | @test size(D) == size(DS) 74 | end 75 | close(D) 76 | close(DS) 77 | end 78 | 79 | check_leaks(t) 80 | 81 | t=@testset "test DArray reshape" begin 82 | D = drand((200,200), [MYID, OTHERIDS]) 83 | 84 | @testset "Test error-throwing in reshape" begin 85 | @test_throws DimensionMismatch reshape(D,(100,100)) 86 | end 87 | 88 | DR = reshape(D,(100,400)) 89 | @testset "Test reshape" begin 90 | @test size(DR) == (100,400) 91 | end 92 | close(D) 93 | end 94 | 95 | check_leaks(t) 96 | 97 | t=@testset "test @DArray comprehension constructor" begin 98 | 99 | @testset "test valid use of @DArray" begin 100 | D = @DArray [i+j for i=1:10, j=1:10] 101 | @test D == [i+j for i=1:10, j=1:10] 102 | close(D) 103 | end 104 | 105 | @testset "test invalid use of @DArray" begin 106 | @test_throws ArgumentError eval(:((@DArray [1,2,3,4]))) 107 | end 108 | end 109 | 110 | check_leaks(t) 111 | 112 | t=@testset "test DArray / Array conversion" begin 113 | D = drand((200,200), [MYID, OTHERIDS]) 114 | 115 | @testset "test convert(::Array, ::(Sub)DArray)" begin 116 | S = convert(Matrix{Float64}, D[1:150, 1:150]) 117 | A = convert(Matrix{Float64}, D) 118 | 119 | @test A[1:150,1:150] == S 120 | D2 = convert(DArray{Float64,2,Matrix{Float64}}, A) 121 | @test D2 == D 122 | @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1] 123 | @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101] 124 | close(D2) 125 | 126 | S2 = convert(Vector{Float64}, D[4, 23:176]) 127 | @test A[4, 23:176] == S2 128 | 129 | S3 = convert(Vector{Float64}, D[23:176, 197]) 130 | @test A[23:176, 197] == S3 131 | 132 | S4 = zeros(4) 133 | setindex!(S4, D[3:4, 99:100], :) 134 | @test S4 == vec(D[3:4, 99:100]) 135 | @test S4 == vec(A[3:4, 99:100]) 136 | 137 | S5 = zeros(2,2) 138 | setindex!(S5, D[1,1:4], :, 1:2) 139 | @test vec(S5) == D[1, 1:4] 140 | @test vec(S5) == A[1, 1:4] 141 | end 142 | close(D) 143 | end 144 | 145 | check_leaks(t) 146 | 147 | t=@testset "copy!" begin 148 | D1 = dzeros((10,10)) 149 | r1 = remotecall_wait(() -> randn(3,10), workers()[1]) 150 | r2 = remotecall_wait(() -> randn(7,10), workers()[2]) 151 | D2 = DArray(reshape([r1; r2], 2, 1)) 152 | copy!(D2, D1) 153 | @test D1 == D2 154 | close(D1) 155 | close(D2) 156 | end 157 | 158 | check_leaks(t) 159 | 160 | t=@testset "test DArray reduce" begin 161 | D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS]) 162 | 163 | @testset "test reduce" begin 164 | @test reduce(+, D) == ((50*MYID) + (50*OTHERIDS)) 165 | end 166 | 167 | @testset "test map / reduce" begin 168 | D2 = map(x->1, D) 169 | @test reduce(+, D2) == 100 170 | close(D2) 171 | end 172 | 173 | @testset "test map! 
/ reduce" begin 174 | map!(x->1, D, D) 175 | @test reduce(+, D) == 100 176 | end 177 | close(D) 178 | end 179 | 180 | check_leaks(t) 181 | 182 | t=@testset "test scale" begin 183 | A = randn(100,100) 184 | DA = distribute(A) 185 | @test scale!(DA, 2) == scale!(A, 2) 186 | close(DA) 187 | end 188 | 189 | check_leaks(t) 190 | 191 | t=@testset "test scale!(b, A)" begin 192 | A = randn(100, 100) 193 | b = randn(100) 194 | DA = distribute(A) 195 | @test scale!(b, A) == scale!(b, DA) 196 | close(DA) 197 | A = randn(100, 100) 198 | b = randn(100) 199 | DA = distribute(A) 200 | @test scale!(A, b) == scale!(DA, b) 201 | close(DA) 202 | end 203 | 204 | check_leaks(t) 205 | 206 | t=@testset "test mapreduce on DArrays" begin 207 | for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *] 208 | A = rand(1:5, rand(2:30)) 209 | DA = distribute(A) 210 | @test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0 211 | close(DA) 212 | end 213 | end 214 | 215 | check_leaks(t) 216 | 217 | t=@testset "test mapreducedim on DArrays" begin 218 | D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS]) 219 | D2 = map(x->1, D) 220 | @test mapreducedim(t -> t*t, +, D2, 1) == mapreducedim(t -> t*t, +, convert(Array, D2), 1) 221 | @test mapreducedim(t -> t*t, +, D2, 2) == mapreducedim(t -> t*t, +, convert(Array, D2), 2) 222 | @test mapreducedim(t -> t*t, +, D2, (1,2)) == mapreducedim(t -> t*t, +, convert(Array, D2), (1,2)) 223 | 224 | # Test non-regularly chunked DArrays 225 | r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1]) 226 | r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2]) 227 | D = DArray(reshape([r1; r2], (2,1))) 228 | @test Array(sum(D, 2)) == sum(Array(D), 2) 229 | 230 | # close(D) 231 | # close(D2) 232 | d_closeall() # temp created by the mapreduce above 233 | end 234 | 235 | check_leaks(t) 236 | 237 | t=@testset "test mapreducdim, reducedim on DArrays" begin 238 | dims = (20,20,20) 239 | DA = drandn(dims) 240 | A = convert(Array, DA) 241 | 242 | @testset "dimension $dms" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3)) 243 | @test mapreducedim(t -> t*t, +, A, dms) ≈ mapreducedim(t -> t*t, +, DA, dms) 244 | @test mapreducedim(t -> t*t, +, A, dms, 1.0) ≈ mapreducedim(t -> t*t, +, DA, dms, 1.0) 245 | @test reducedim(*, A, dms) ≈ reducedim(*, DA, dms) 246 | @test reducedim(*, A, dms, 2.0) ≈ reducedim(*, DA, dms, 2.0) 247 | end 248 | close(DA) 249 | d_closeall() # temp created by the mapreduce above 250 | end 251 | 252 | check_leaks(t) 253 | 254 | t=@testset "test statistical functions on DArrays" begin 255 | dims = (20,20,20) 256 | DA = drandn(dims) 257 | A = convert(Array, DA) 258 | 259 | @testset "test $f for dimension $dms" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3)) 260 | # std is pending implementation 261 | @test f(DA,dms) ≈ f(A,dms) 262 | end 263 | 264 | close(DA) 265 | d_closeall() # temporaries created above 266 | end 267 | 268 | check_leaks(t) 269 | 270 | t=@testset "test sum on DArrays" begin 271 | A = randn(100,100) 272 | DA = distribute(A) 273 | 274 | # sum either throws an ArgumentError or a CompositeException of ArgumentErrors 275 | try 276 | sum(DA, -1) 277 | catch err 278 | if isa(err, CompositeException) 279 | @test !isempty(err.exceptions) 280 | for excep in err.exceptions 281 | # Unpack the remote exception 282 | orig_err = excep.ex.captured.ex 283 | @test isa(orig_err, ArgumentError) 284 | end 285 | else 286 | @test isa(err, ArgumentError) 287 | end 288 | end 289 | try 
290 | sum(DA, 0) 291 | catch err 292 | if isa(err, CompositeException) 293 | @test !isempty(err.exceptions) 294 | for excep in err.exceptions 295 | # Unpack the remote exception 296 | orig_err = excep.ex.captured.ex 297 | @test isa(orig_err, ArgumentError) 298 | end 299 | else 300 | @test isa(err, ArgumentError) 301 | end 302 | end 303 | 304 | @test sum(DA) ≈ sum(A) 305 | @test sum(DA,1) ≈ sum(A,1) 306 | @test sum(DA,2) ≈ sum(A,2) 307 | @test sum(DA,3) ≈ sum(A,3) 308 | close(DA) 309 | d_closeall() # temporaries created above 310 | end 311 | 312 | check_leaks(t) 313 | 314 | t=@testset "test size on DArrays" begin 315 | 316 | A = randn(100,100) 317 | DA = distribute(A) 318 | 319 | @test_throws BoundsError size(DA, 0) 320 | @test size(DA,1) == size(A,1) 321 | @test size(DA,2) == size(A,2) 322 | @test size(DA,3) == size(A,3) 323 | close(DA) 324 | end 325 | 326 | check_leaks(t) 327 | 328 | # test length / endof 329 | t=@testset "test collections API" begin 330 | A = randn(23,23) 331 | DA = distribute(A) 332 | 333 | @testset "test length" begin 334 | @test length(DA) == length(A) 335 | end 336 | 337 | @testset "test endof" begin 338 | @test endof(DA) == endof(A) 339 | end 340 | close(DA) 341 | end 342 | 343 | check_leaks(t) 344 | 345 | t=@testset "test max / min / sum" begin 346 | a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(100,1000)) 347 | d = distribute(a) 348 | 349 | @test sum(d) == sum(a) 350 | @test maximum(d) == maximum(a) 351 | @test minimum(d) == minimum(a) 352 | @test maximum(abs, d) == maximum(abs, a) 353 | @test minimum(abs, d) == minimum(abs, a) 354 | @test sum(abs, d) == sum(abs, a) 355 | @test sum(abs2, d) == sum(abs2, a) 356 | close(d) 357 | end 358 | 359 | check_leaks(t) 360 | 361 | t=@testset "test all / any" begin 362 | a = map(x->Int(round(rand() * 100)) - 50, Array{Int}(100,1000)) 363 | a = [true for i in 1:100] 364 | d = distribute(a) 365 | 366 | @test all(d) 367 | @test any(d) 368 | 369 | close(d) 370 | 371 | a[50] = false 372 | d = distribute(a) 373 | @test !all(d) 374 | @test any(d) 375 | 376 | close(d) 377 | 378 | a = [false for i in 1:100] 379 | d = distribute(a) 380 | @test !all(d) 381 | @test !any(d) 382 | 383 | close(d) 384 | 385 | d = dones(10,10) 386 | @test !all(x-> x>1.0, d) 387 | @test all(x-> x>0.0, d) 388 | 389 | close(d) 390 | 391 | a = ones(10,10) 392 | a[10] = 2.0 393 | d = distribute(a) 394 | @test any(x-> x == 1.0, d) 395 | @test any(x-> x == 2.0, d) 396 | @test !any(x-> x == 3.0, d) 397 | 398 | close(d) 399 | end 400 | 401 | check_leaks(t) 402 | 403 | t=@testset "test count" begin 404 | a = ones(10,10) 405 | a[10] = 2.0 406 | d = distribute(a) 407 | 408 | @test count(x-> x == 2.0, d) == 1 409 | @test count(x-> x == 1.0, d) == 99 410 | @test count(x-> x == 0.0, d) == 0 411 | 412 | close(d) 413 | end 414 | 415 | check_leaks(t) 416 | 417 | t=@testset "test prod" begin 418 | a = fill(2, 10); 419 | d = distribute(a); 420 | @test prod(d) == 2^10 421 | 422 | close(d) 423 | end 424 | 425 | check_leaks(t) 426 | 427 | t=@testset "test zeros" begin 428 | @testset "1D dzeros default element type" begin 429 | A = dzeros(10) 430 | @test A == zeros(10) 431 | @test eltype(A) == Float64 432 | @test size(A) == (10,) 433 | close(A) 434 | end 435 | 436 | @testset "1D dzeros with specified element type" begin 437 | A = dzeros(Int, 10) 438 | @test A == zeros(10) 439 | @test eltype(A) == Int 440 | @test size(A) == (10,) 441 | close(A) 442 | end 443 | 444 | @testset "2D dzeros default element type, Dims constuctor" begin 445 | A = dzeros((10,10)) 446 | @test A == 
zeros((10,10)) 447 | @test eltype(A) == Float64 448 | @test size(A) == (10,10) 449 | close(A) 450 | end 451 | 452 | @testset "2D dzeros specified element type, Dims constructor" begin 453 | A = dzeros(Int, (10,10)) 454 | @test A == zeros(Int, (10,10)) 455 | @test eltype(A) == Int 456 | @test size(A) == (10,10) 457 | close(A) 458 | end 459 | 460 | @testset "2D dzeros, default element type" begin 461 | A = dzeros(10,10) 462 | @test A == zeros(10,10) 463 | @test eltype(A) == Float64 464 | @test size(A) == (10,10) 465 | close(A) 466 | end 467 | 468 | @testset "2D dzeros, specified element type" begin 469 | A = dzeros(Int, 10, 10) 470 | @test A == zeros(Int, 10, 10) 471 | @test eltype(A) == Int 472 | @test size(A) == (10,10) 473 | close(A) 474 | end 475 | end 476 | 477 | check_leaks(t) 478 | 479 | t=@testset "test dones" begin 480 | @testset "1D dones default element type" begin 481 | A = dones(10) 482 | @test A == ones(10) 483 | @test eltype(A) == Float64 484 | @test size(A) == (10,) 485 | close(A) 486 | end 487 | 488 | @testset "1D dones with specified element type" begin 489 | A = dones(Int, 10) 490 | @test eltype(A) == Int 491 | @test size(A) == (10,) 492 | close(A) 493 | end 494 | 495 | @testset "2D dones default element type, Dims constuctor" begin 496 | A = dones((10,10)) 497 | @test A == ones((10,10)) 498 | @test eltype(A) == Float64 499 | @test size(A) == (10,10) 500 | close(A) 501 | end 502 | 503 | @testset "2D dones specified element type, Dims constructor" begin 504 | A = dones(Int, (10,10)) 505 | @test A == ones(Int, (10,10)) 506 | @test eltype(A) == Int 507 | @test size(A) == (10,10) 508 | close(A) 509 | end 510 | 511 | @testset "2D dones, default element type" begin 512 | A = dones(10,10) 513 | @test A == ones(10,10) 514 | @test eltype(A) == Float64 515 | @test size(A) == (10,10) 516 | close(A) 517 | end 518 | 519 | @testset "2D dones, specified element type" begin 520 | A = dones(Int, 10, 10) 521 | @test A == ones(Int, 10, 10) 522 | @test eltype(A) == Int 523 | @test size(A) == (10,10) 524 | close(A) 525 | end 526 | end 527 | 528 | check_leaks(t) 529 | 530 | t=@testset "test drand" begin 531 | @testset "1D drand" begin 532 | A = drand(100) 533 | @test eltype(A) == Float64 534 | @test size(A) == (100,) 535 | @test all(x-> x >= 0.0 && x <= 1.0, A) 536 | close(A) 537 | end 538 | 539 | @testset "1D drand, specified element type" begin 540 | A = drand(Int, 100) 541 | @test eltype(A) == Int 542 | @test size(A) == (100,) 543 | close(A) 544 | end 545 | 546 | @testset "1D drand, UnitRange" begin 547 | A = drand(1:10, 100) 548 | @test eltype(A) == Int 549 | @test size(A) == (100,) 550 | close(A) 551 | end 552 | 553 | @testset "1D drand, Array" begin 554 | A = drand([-1,0,1], 100) 555 | @test eltype(A) == Int 556 | @test size(A) == (100,) 557 | close(A) 558 | end 559 | 560 | @testset "2D drand, Dims constructor" begin 561 | A = drand((50,50)) 562 | @test eltype(A) == Float64 563 | @test size(A) == (50,50) 564 | @test all(x-> x >= 0.0 && x <= 1.0, A) 565 | close(A) 566 | end 567 | 568 | @testset "2D drand" begin 569 | A = drand(100,100) 570 | @test eltype(A) == Float64 571 | @test size(A) == (100,100) 572 | @test all(x-> x >= 0.0 && x <= 1.0, A) 573 | close(A) 574 | end 575 | 576 | @testset "2D drand, Dims constructor, specified element type" begin 577 | A = drand(Int, (100,100)) 578 | @test eltype(A) == Int 579 | @test size(A) == (100,100) 580 | close(A) 581 | end 582 | 583 | @testset "2D drand, specified element type" begin 584 | A = drand(Int, 100, 100) 585 | @test eltype(A) == Int 586 | 
@test size(A) == (100,100) 587 | close(A) 588 | end 589 | end 590 | 591 | check_leaks(t) 592 | 593 | t=@testset "test randn" begin 594 | @testset "1D drandn" begin 595 | A = drandn(100) 596 | @test eltype(A) == Float64 597 | @test size(A) == (100,) 598 | close(A) 599 | end 600 | 601 | @testset "2D drandn, Dims constructor" begin 602 | A = drandn((50,50)) 603 | @test eltype(A) == Float64 604 | @test size(A) == (50,50) 605 | close(A) 606 | end 607 | 608 | @testset "2D drandn" begin 609 | A = drandn(100,100) 610 | @test eltype(A) == Float64 611 | @test size(A) == (100,100) 612 | close(A) 613 | end 614 | end 615 | 616 | check_leaks(t) 617 | 618 | t=@testset "test c/transpose" begin 619 | @testset "test ctranspose real" begin 620 | A = drand(Float64, 100, 200) 621 | @test A' == Array(A)' 622 | close(A) 623 | end 624 | @testset "test ctranspose complex" begin 625 | A = drand(Complex128, 200, 100) 626 | @test A' == Array(A)' 627 | close(A) 628 | end 629 | @testset "test transpose real" begin 630 | A = drand(Float64, 200, 100) 631 | @test A.' == Array(A).' 632 | close(A) 633 | end 634 | @testset "test ctranspose complex" begin 635 | A = drand(Complex128, 100, 200) 636 | @test A.' == Array(A).' 637 | close(A) 638 | end 639 | 640 | d_closeall() # close the temporaries created above 641 | end 642 | 643 | check_leaks(t) 644 | 645 | t=@testset "test convert from subdarray" begin 646 | a = drand(20, 20); 647 | 648 | s = view(a, 1:5, 5:8) 649 | @test isa(s, SubDArray) 650 | @test s == convert(DArray, s) 651 | 652 | s = view(a, 6:5, 5:8) 653 | @test isa(s, SubDArray) 654 | @test s == convert(DArray, s) 655 | close(a) 656 | d_closeall() # close the temporaries created above 657 | end 658 | 659 | check_leaks(t) 660 | 661 | t=@testset "test scalar math" begin 662 | a = drand(20, 20); 663 | b = convert(Array, a) 664 | @testset "$f" for f in (-, abs, abs2, acos, acosd, acot, 665 | acotd, acsch, angle, asech, asin, 666 | asind, asinh, atan, atand, atanh, 667 | big, cbrt, ceil, cis, complex, conj, 668 | cos, cosc, cosd, cosh, cospi, cot, 669 | cotd, coth, csc, cscd, csch, dawson, 670 | deg2rad, digamma, erf, erfc, erfcinv, 671 | erfcx, erfi, erfinv, exp, exp10, exp2, 672 | expm1, exponent, float, floor, gamma, imag, 673 | invdigamma, isfinite, isinf, isnan, lfact, 674 | lgamma, log, log10, log1p, log2, rad2deg, real, 675 | sec, secd, sech, sign, sin, sinc, sind, 676 | sinh, sinpi, sqrt, tan, tand, tanh, trigamma) 677 | @test f.(a) == f.(b) 678 | end 679 | a = a + 1 680 | b = b + 1 681 | @testset "$f" for f in (asec, asecd, acosh, acsc, acscd, acoth) 682 | @test f.(a) == f.(b) 683 | end 684 | close(a) 685 | d_closeall() # close the temporaries created above 686 | end 687 | 688 | check_leaks(t) 689 | 690 | t=@testset "test mapslices" begin 691 | A = randn(5,5,5) 692 | D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)]) 693 | @test mapslices(svdvals, D, (1,2)) ≈ mapslices(svdvals, A, (1,2)) 694 | @test mapslices(svdvals, D, (1,3)) ≈ mapslices(svdvals, A, (1,3)) 695 | @test mapslices(svdvals, D, (2,3)) ≈ mapslices(svdvals, A, (2,3)) 696 | @test mapslices(sort, D, (1,)) ≈ mapslices(sort, A, (1,)) 697 | @test mapslices(sort, D, (2,)) ≈ mapslices(sort, A, (2,)) 698 | @test mapslices(sort, D, (3,)) ≈ mapslices(sort, A, (3,)) 699 | 700 | # issue #3613 701 | B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), [1,2]) 702 | @test size(B) == (1,1,4) 703 | @test all(B.==6) 704 | 705 | # issue #5141 706 | C1 = mapslices(x-> maximum(-x), D, []) 707 | @test C1 == -D 708 | 709 | # 

t=@testset "test mapslices" begin
    A = randn(5,5,5)
    D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])
    @test mapslices(svdvals, D, (1,2)) ≈ mapslices(svdvals, A, (1,2))
    @test mapslices(svdvals, D, (1,3)) ≈ mapslices(svdvals, A, (1,3))
    @test mapslices(svdvals, D, (2,3)) ≈ mapslices(svdvals, A, (2,3))
    @test mapslices(sort, D, (1,)) ≈ mapslices(sort, A, (1,))
    @test mapslices(sort, D, (2,)) ≈ mapslices(sort, A, (2,))
    @test mapslices(sort, D, (3,)) ≈ mapslices(sort, A, (3,))

    # issue #3613
    B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), [1,2])
    @test size(B) == (1,1,4)
    @test all(B.==6)

    # issue #5141
    C1 = mapslices(x-> maximum(-x), D, [])
    @test C1 == -D

    # issue #5177
    c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])
    m1 = mapslices(x-> ones(2,3), c, [1,2])
    m2 = mapslices(x-> ones(2,4), c, [1,3])
    m3 = mapslices(x-> ones(3,4), c, [2,3])
    @test size(m1) == size(m2) == size(m3) == size(c)

    n1 = mapslices(x-> ones(6), c, [1,2])
    n2 = mapslices(x-> ones(6), c, [1,3])
    n3 = mapslices(x-> ones(6), c, [2,3])
    n1a = mapslices(x-> ones(1,6), c, [1,2])
    n2a = mapslices(x-> ones(1,6), c, [1,3])
    n3a = mapslices(x-> ones(1,6), c, [2,3])
    @test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))
    @test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))
    close(D)
    close(c)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test scalar ops" begin
    a = drand(20,20)
    b = convert(Array, a)
    c = drand(20,20)
    d = convert(Array, c)

    # operators are supplied as Symbols and spliced into each test expression with @eval
    @testset "$f" for f in (:+, :-, :.+, :.-, :.*, :./, :.%)
        x = rand()
        @test @eval ($f)($a, $x) == ($f)($b, $x)
        @test @eval ($f)($x, $a) == ($f)($x, $b)
        @test @eval ($f)($a, $c) == ($f)($b, $d)
    end

    close(a)
    close(c)

    a = dones(Int, 20, 20)
    b = convert(Array, a)
    @testset "$f" for f in (:.<<, :.>>)
        @test @eval ($f)($a, 2) == ($f)($b, 2)
        @test @eval ($f)(2, $a) == ($f)(2, $b)
        @test @eval ($f)($a, $a) == ($f)($b, $b)
    end

    @testset "$f" for f in (:rem,)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
    end
    close(a)
    close(c)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test broadcast ops" begin
    wrkrs = workers()
    nwrkrs = length(wrkrs)
    nrows = 20 * nwrkrs
    ncols = 10 * nwrkrs
    a = drand((nrows,ncols), wrkrs, (1, nwrkrs))
    m = mean(a, 1)
    c = a .- m
    d = convert(Array, a) .- convert(Array, m)
    @test c == d
    d_closeall()
end

check_leaks(t)

t=@testset "test matrix multiplication" begin
    A = drandn(20,20)
    b = drandn(20)
    B = drandn(20,20)

    @test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())
    close(A)
    close(b)
    close(B)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test norm" begin
    x = drandn(20)

    @test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())
    @test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())
    @test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())
    @test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())
    close(x)
end

check_leaks(t)
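
# The distributed linear-algebra results above (and the axpy! check below) are
# compared against the same operations on gathered Arrays within a sqrt(eps())
# tolerance rather than exact equality: partial products and sums are computed
# per worker and then combined, so floating-point rounding can differ slightly
# from the serial evaluation order.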

t=@testset "test axpy!" begin
    x = drandn(20)
    y = drandn(20)

    @test norm(convert(Array, LinAlg.axpy!(2.0, x, copy(y))) - LinAlg.axpy!(2.0, convert(Array, x), convert(Array, y))) < sqrt(eps())
    @test_throws DimensionMismatch LinAlg.axpy!(2.0, x, zeros(length(x) + 1))
    close(x)
    close(y)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test ppeval" begin
    A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])
    B = drandn((10, nworkers()), workers(), [1, nworkers()])

    # ppeval applies the function to each worker's localpart, i.e. slice-by-slice
    # along the last (distributed) dimension here
    R = zeros(10, nworkers())
    for i = 1:nworkers()
        R[:, i] = convert(Array, A)[:, :, i]*convert(Array, B)[:, i]
    end
    @test convert(Array, ppeval(*, A, B)) ≈ R
    @test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, eye(10, 10)))
    close(A)
    close(B)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test nnz" begin
    A = sprandn(10, 10, 0.5)
    @test nnz(distribute(A)) == nnz(A)
end

t=@testset "test matmatmul" begin
    A = drandn(30, 30)
    B = drandn(30, 20)
    a = convert(Array, A)
    b = convert(Array, B)

    AB = A * B
    AtB = A.' * B
    AcB = A' * B

    ab = a * b
    atb = a.' * b
    acb = a' * b

    @test AB ≈ ab
    @test AtB ≈ atb
    @test AcB ≈ acb
    d_closeall() # close the temporaries created above
end

t=@testset "sort, T = $T" for i in 0:6, T in [Int, Float64]
    d = DistributedArrays.drand(T, 10^i)
    @testset "sample = $sample" for sample in Any[true, false, (minimum(d),maximum(d)), rand(T, 10^i>512 ? 512 : 10^i)]
        d2 = DistributedArrays.sort(d; sample=sample)

        @test length(d) == length(d2)
        @test sort(convert(Array, d)) == convert(Array, d2)
    end
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "ddata" begin
    # indexing a ddata handle with :L returns the localpart stored on the calling worker
    d = ddata(;T=Int, init=I->myid())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Int, data=workers())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Any, init=I->"Hello World!")
    for p in workers()
        @test "Hello World!" == remotecall_fetch(d->d[:L], p, d)
    end
    @test Any["Hello World!" for p in workers()] == gather(d)

    close(d)
end

check_leaks(t)

d_closeall()

t=@testset "test for any leaks" begin
    sleep(1.0) # allow time for any cleanup to complete
    allrefszero = Bool[remotecall_fetch(()->length(DistributedArrays.refs) == 0, p) for p in procs()]
    @test all(allrefszero)

    allregistrieszero = Bool[remotecall_fetch(()->length(DistributedArrays.registry) == 0, p) for p in procs()]
    @test all(allregistrieszero)
end

--------------------------------------------------------------------------------