├── REQUIRE ├── codecov.yml ├── .gitignore ├── .travis.yml ├── src ├── DistributedArrays.jl ├── core.jl ├── serialize.jl ├── sort.jl ├── spmd.jl ├── linalg.jl ├── mapreduce.jl └── darray.jl ├── LICENSE.md ├── test ├── runtests.jl ├── spmd.jl └── darray.jl └── README.md /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 0.6- 2 | Primes 3 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: off 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.cov 2 | *.jl.mem 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: julia 2 | os: 3 | - linux 4 | - osx 5 | julia: 6 | - nightly 7 | matrix: 8 | # allow_failures: 9 | # - julia: nightly 10 | notifications: 11 | email: false 12 | before_install: 13 | - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi 14 | after_success: 15 | - julia -e 'cd(Pkg.dir("DistributedArrays")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder()); Codecov.submit(Codecov.process_folder())' 16 | -------------------------------------------------------------------------------- /src/DistributedArrays.jl: -------------------------------------------------------------------------------- 1 | __precompile__(true) 2 | 3 | module DistributedArrays 4 | 5 | using Primes 6 | using Primes: factor 7 | 8 | importall Base 9 | import Base.Callable 10 | import Base.BLAS: axpy! 11 | 12 | # DArray exports 13 | export (.+), (.-), (.*), (./), (.%), (.<<), (.>>), div, mod, rem, (&), (|), ($) 14 | export DArray, SubDArray, SubOrDArray, @DArray 15 | export dzeros, dones, dfill, drand, drandn, distribute, localpart, localindexes, ppeval, samedist 16 | 17 | # non-array distributed data 18 | export ddata, gather 19 | 20 | # immediate release of localparts 21 | export close, d_closeall 22 | 23 | include("darray.jl") 24 | include("core.jl") 25 | include("serialize.jl") 26 | include("mapreduce.jl") 27 | include("linalg.jl") 28 | include("sort.jl") 29 | 30 | include("spmd.jl") 31 | export SPMD 32 | 33 | end # module 34 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The DistributedArrays.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2015: Julia Parallel Contributors 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 
15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Base.Test 2 | 3 | using DistributedArrays 4 | 5 | # add at least 3 worker processes 6 | if nworkers() < 3 7 | n = max(3, min(8, Sys.CPU_CORES)) 8 | addprocs(n; exeflags=`--check-bounds=yes`) 9 | end 10 | @assert nprocs() > 3 11 | @assert nworkers() >= 3 12 | 13 | @everywhere importall DistributedArrays 14 | @everywhere importall DistributedArrays.SPMD 15 | 16 | @everywhere srand(1234 + myid()) 17 | 18 | const MYID = myid() 19 | const OTHERIDS = filter(id-> id != MYID, procs())[rand(1:(nprocs()-1))] 20 | 21 | # On 0.6, @testset does not display the test description automatically anymore. 22 | function print_test_desc(t, n=0) 23 | println(repeat(" ", n), "Passed : ", t.description) 24 | for t2 in t.results 25 | if isa(t2, Base.Test.DefaultTestSet) 26 | print_test_desc(t2, n+2) 27 | end 28 | end 29 | end 30 | 31 | function check_leaks(t=nothing) 32 | if length(DistributedArrays.refs) > 0 33 | sleep(0.1) # allow time for any cleanup to complete and test again 34 | length(DistributedArrays.refs) > 0 && warn("Probable leak of ", length(DistributedArrays.refs), " darrays") 35 | end 36 | 37 | isa(t, Base.Test.DefaultTestSet) && print_test_desc(t) 38 | end 39 | 40 | include("darray.jl") 41 | include("spmd.jl") 42 | 43 | -------------------------------------------------------------------------------- /src/core.jl: -------------------------------------------------------------------------------- 1 | const registry=Dict{Tuple, Any}() 2 | const refs=Set() # Collection of darray identities created on this node 3 | 4 | let DID::Int = 1 5 | global next_did 6 | next_did() = (id = DID; DID += 1; (myid(), id)) 7 | end 8 | 9 | """ 10 | next_did() 11 | 12 | Produces an incrementing ID that will be used for DArrays. 13 | """ 14 | next_did 15 | 16 | release_localpart(id::Tuple) = (delete!(registry, id); nothing) 17 | release_localpart(d) = release_localpart(d.id) 18 | 19 | function close_by_id(id, pids) 20 | # @schedule println("Finalizer for : ", id) 21 | global refs 22 | @sync begin 23 | for p in pids 24 | @async remotecall_fetch(release_localpart, p, id) 25 | end 26 | if !(myid() in pids) 27 | release_localpart(id) 28 | end 29 | end 30 | delete!(refs, id) 31 | nothing 32 | end 33 | 34 | function close(d::DArray) 35 | # @schedule println("close : ", d.id, ", object_id : ", object_id(d), ", myid : ", myid() ) 36 | if (myid() == d.id[1]) && d.release 37 | @schedule close_by_id(d.id, d.pids) 38 | d.release = false 39 | end 40 | nothing 41 | end 42 | 43 | function d_closeall() 44 | crefs = copy(refs) 45 | for id in crefs 46 | if id[1] == myid() # sanity check 47 | haskey(registry, id) && close(registry[id]) 48 | yield() 49 | end 50 | end 51 | end 52 | 53 | """ 54 | procs(d::DArray) 55 | 56 | Get the vector of processes storing pieces of DArray `d`. 
57 | """ 58 | Base.procs(d::DArray) = d.pids 59 | 60 | """ 61 | localpart(A) 62 | 63 | The identity when input is not distributed 64 | """ 65 | localpart(A) = A 66 | 67 | -------------------------------------------------------------------------------- /src/serialize.jl: -------------------------------------------------------------------------------- 1 | function Base.serialize{T,N,A}(S::AbstractSerializer, d::DArray{T,N,A}) 2 | # Only send the ident for participating workers - we expect the DArray to exist in the 3 | # remote registry. DO NOT send the localpart. 4 | destpid = Base.worker_id_from_socket(S.io) 5 | Serializer.serialize_type(S, typeof(d)) 6 | if (destpid in d.pids) || (destpid == d.id[1]) 7 | serialize(S, (true, d.id)) # (id_only, id) 8 | else 9 | serialize(S, (false, d.id)) 10 | for n in [:dims, :pids, :indexes, :cuts] 11 | serialize(S, getfield(d, n)) 12 | end 13 | serialize(S, A) 14 | end 15 | end 16 | 17 | function Base.deserialize{DT<:DArray}(S::AbstractSerializer, t::Type{DT}) 18 | what = deserialize(S) 19 | id_only = what[1] 20 | id = what[2] 21 | 22 | if id_only 23 | if haskey(registry, id) 24 | return registry[id] 25 | else 26 | # access to fields will throw an error, at least the deserialization process will not 27 | # result in worker death 28 | d = DT() 29 | d.id = id 30 | return d 31 | end 32 | else 33 | # We are not a participating worker, deser fields and instantiate locally. 34 | dims = deserialize(S) 35 | pids = deserialize(S) 36 | indexes = deserialize(S) 37 | cuts = deserialize(S) 38 | A = deserialize(S) 39 | T=eltype(DT) 40 | N=length(dims) 41 | return DT(id, dims, pids, indexes, cuts, empty_localpart(T,N,A)) 42 | end 43 | end 44 | 45 | # Serialize only those parts of the object as required by the destination worker. 46 | type DestinationSerializer 47 | generate::Nullable{Function} # Function to generate the part to be serialized 48 | pids::Nullable{Array} # MUST have the same shape as the distribution 49 | 50 | deser_obj::Nullable{Any} # Deserialized part 51 | 52 | DestinationSerializer(f,p,d) = new(f,p,d) 53 | end 54 | 55 | DestinationSerializer(f::Function, pids::Array) = DestinationSerializer(f, pids, Nullable{Any}()) 56 | 57 | # contructs a DestinationSerializer after verifying that the shape of pids. 58 | function verified_destination_serializer(f::Function, pids::Array, verify_size) 59 | @assert size(pids) == verify_size 60 | return DestinationSerializer(f, pids) 61 | end 62 | 63 | DestinationSerializer(deser_obj::Any) = DestinationSerializer(Nullable{Function}(), Nullable{Array}(), deser_obj) 64 | 65 | function Base.serialize(S::AbstractSerializer, s::DestinationSerializer) 66 | pid = Base.worker_id_from_socket(S.io) 67 | pididx = findfirst(get(s.pids), pid) 68 | Serializer.serialize_type(S, typeof(s)) 69 | serialize(S, get(s.generate)(pididx)) 70 | end 71 | 72 | function Base.deserialize{T<:DestinationSerializer}(S::AbstractSerializer, t::Type{T}) 73 | lpart = deserialize(S) 74 | return DestinationSerializer(lpart) 75 | end 76 | 77 | 78 | function localpart(s::DestinationSerializer) 79 | if !isnull(s.deser_obj) 80 | return get(s.deser_obj) 81 | elseif !isnull(s.generate) && (myid() in get(s.pids)) 82 | # Handle the special case where myid() is part of s.pids. 
83 | # In this case serialize/deserialize is not called as the remotecall is executed locally 84 | return get(s.generate)(findfirst(get(s.pids), myid())) 85 | else 86 | throw(ErrorException(string("Invalid state in DestinationSerializer."))) 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /test/spmd.jl: -------------------------------------------------------------------------------- 1 | @everywhere function spmd_test1() 2 | barrier(;tag=:b1) 3 | 4 | if myid() == 1 5 | @assert recvfrom(2) == "Hello from 2" 6 | println("SPMD: Passed send/recv") 7 | elseif myid() == 2 8 | data = "Hello from 2" 9 | sendto(1, data) 10 | end 11 | 12 | stime = rand(1:5) 13 | # println("Sleeping for $stime seconds") 14 | sleep(stime) 15 | barrier(;tag=:b2) 16 | 17 | bcast_val = nothing 18 | if myid() == 1 19 | bcast_val = rand(2) 20 | end 21 | 22 | bcast_val = bcast(bcast_val, 1) 23 | 24 | if myid() == 1 25 | @assert bcast_val == recvfrom(2) 26 | println("SPMD: Passed broadcast") 27 | elseif myid() == 2 28 | sendto(1, bcast_val) 29 | end 30 | 31 | barrier() 32 | 33 | scatter_data = nothing 34 | if myid() == 1 35 | scatter_data = rand(Int8, nprocs()) 36 | end 37 | lp = scatter(scatter_data, 1, tag=1) 38 | 39 | if myid() == 1 40 | @assert scatter_data[2:2] == recvfrom(2) 41 | println("SPMD: Passed scatter 1") 42 | elseif myid() == 2 43 | sendto(1, lp) 44 | end 45 | 46 | scatter_data = nothing 47 | if myid() == 1 48 | scatter_data = rand(Int8, nprocs()*2) 49 | end 50 | lp = scatter(scatter_data, 1, tag=2) 51 | 52 | if myid() == 1 53 | @assert scatter_data[3:4] == recvfrom(2) 54 | println("SPMD: Passed scatter 2") 55 | elseif myid() == 2 56 | sendto(1, lp) 57 | end 58 | 59 | gathered_data = gather(myid(), 1, tag=3) 60 | if myid() == 1 61 | @assert gathered_data == procs() 62 | println("SPMD: Passed gather 1") 63 | end 64 | 65 | gathered_data = gather([myid(), myid()], 1, tag=4) 66 | if myid() == 1 67 | @assert gathered_data == [[p,p] for p in procs()] 68 | println("SPMD: Passed gather 2") 69 | end 70 | end 71 | 72 | spmd(spmd_test1) 73 | 74 | # Test running only on the workers using the spmd function. 75 | 76 | # define the function everywhere 77 | @everywhere function foo_spmd(d_in, d_out, n) 78 | pids=sort(vec(procs(d_in))) 79 | pididx = findfirst(pids, myid()) 80 | mylp = localpart(d_in) 81 | localsum = 0 82 | 83 | # Have each node exchange data with its neighbors 84 | n_pididx = pididx+1 > length(pids) ? 1 : pididx+1 85 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 86 | 87 | # println(p_pididx, " p", pids[p_pididx], " ", n_pididx, " p", pids[n_pididx]) 88 | # println(mylp) 89 | 90 | for i in 1:n 91 | sendto(pids[n_pididx], mylp[2]) 92 | sendto(pids[p_pididx], mylp[1]) 93 | 94 | mylp[2] = recvfrom(pids[p_pididx]) 95 | mylp[1] = recvfrom(pids[n_pididx]) 96 | 97 | # println(mylp) 98 | 99 | barrier(;pids=pids) 100 | localsum = localsum + mylp[1] + mylp[2] 101 | end 102 | 103 | # finally store the sum in d_out 104 | d_out[:L] = localsum 105 | end 106 | 107 | # run foo_spmd on all workers, many of them, all concurrently using implictly different contexts. 
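# (Each input DArray below is an nworkers()-by-2 matrix, distributed one row per worker and filled
# with the owning worker's id; after nworkers() ring-exchange iterations every worker's local sum
# is 2*sum(workers()), which is what the gather-based checks below verify.)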
108 | in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8) 109 | out_arrays = map(x->ddata(), 1:8) 110 | 111 | @sync for i in 1:8 112 | @async spmd(foo_spmd, in_arrays[i], out_arrays[i], nworkers(); pids=workers()) 113 | end 114 | for i in 1:8 115 | @test Any[sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i]) 116 | end 117 | 118 | println("SPMD: Passed testing of spmd function run concurrently") 119 | 120 | # run concurrently with explictly different contexts 121 | 122 | # define the function everywhere 123 | @everywhere function foo_spmd2(d_in, d_out, n) 124 | pids=sort(vec(procs(d_in))) 125 | pididx = findfirst(pids, myid()) 126 | mylp = localpart(d_in) 127 | 128 | # see if we have a value in the local store. 129 | store = context_local_storage() 130 | 131 | localsum = get!(store, :LOCALSUM, 0) 132 | 133 | # Have each node exchange data with its neighbors 134 | n_pididx = pididx+1 > length(pids) ? 1 : pididx+1 135 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 136 | 137 | for i in 1:n 138 | sendto(pids[n_pididx], mylp[2]) 139 | sendto(pids[p_pididx], mylp[1]) 140 | 141 | mylp[2] = recvfrom(pids[p_pididx]) 142 | mylp[1] = recvfrom(pids[n_pididx]) 143 | 144 | barrier(;pids=pids) 145 | localsum = localsum + mylp[1] + mylp[2] 146 | end 147 | 148 | # finally store the sum in d_out 149 | d_out[:L] = localsum 150 | store[:LOCALSUM] = localsum 151 | end 152 | 153 | 154 | in_arrays = map(x->DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]), 1:8) 155 | out_arrays = map(x->ddata(), 1:8) 156 | contexts = map(x->context(workers()), 1:8) 157 | 158 | @sync for i in 1:8 159 | @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i]) 160 | end 161 | # Second run will add the value stored in the previous run. 162 | @sync for i in 1:8 163 | @async spmd(foo_spmd2, in_arrays[i], out_arrays[i], nworkers(); pids=workers(), context=contexts[i]) 164 | end 165 | 166 | for i in 1:8 167 | @test Any[2*sum(workers())*2 for i in 1:nworkers()] == gather(out_arrays[i]) 168 | end 169 | 170 | # verify localstores with appropriate context store values exist. 171 | @everywhere begin 172 | if myid() != 1 173 | n = 0 174 | for (k,v) in DistributedArrays.SPMD.map_ctxts 175 | store = v.store 176 | localsum = store[:LOCALSUM] 177 | if localsum != 2*sum(workers())*2 178 | println("localsum ", localsum, " != $(2*sum(workers())*2)") 179 | error("localsum mismatch") 180 | end 181 | n += 1 182 | end 183 | @assert n == 8 184 | end 185 | end 186 | 187 | # close the contexts 188 | foreach(x->close(x), contexts) 189 | 190 | # verify that the localstores have been deleted. 191 | @everywhere begin 192 | @assert isempty(DistributedArrays.SPMD.map_ctxts) 193 | end 194 | 195 | println("SPMD: Passed spmd function with explicit context run concurrently") 196 | 197 | -------------------------------------------------------------------------------- /src/sort.jl: -------------------------------------------------------------------------------- 1 | # Sorting a DVector using samplesort 2 | 3 | function sample_n_setup_ref(d::DVector, sample_size; kwargs...) 4 | lp = localpart(d) 5 | llp = length(lp) 6 | np = length(procs(d)) 7 | sample_size = llp > sample_size ? sample_size : llp 8 | sorted = sort(lp; kwargs...) 9 | sample = sorted[collect(1:div(llp,sample_size):llp)] 10 | ref = RemoteChannel(()->Channel(np+1)) # To collect parts to be sorted locally later. 
11 | # First element is the locally sorted vector 12 | put!(ref, sorted) 13 | return (sample, ref) 14 | end 15 | 16 | 17 | function scatter_n_sort_localparts{T}(d, myidx, refs::Array{RemoteChannel}, boundaries::Array{T}; by = identity, kwargs...) 18 | if d==nothing 19 | sorted = take!(refs[myidx]) # First entry in the remote channel is sorted localpart 20 | else 21 | sorted = sort(localpart(d); by = by, kwargs...) 22 | end 23 | 24 | # send respective parts to correct workers, iterate over sorted array 25 | p_sorted = 1 26 | for (i,r) in enumerate(refs) 27 | p_till = length(sorted)+1 28 | 29 | # calculate range to send to refs[i] 30 | ctr=1 31 | for x in sorted[p_sorted:end] 32 | if by(x) > by(boundaries[i+1]) 33 | p_till = p_sorted+ctr-1 34 | break 35 | else 36 | ctr += 1 37 | end 38 | end 39 | 40 | if p_till == p_sorted 41 | @async put!(r, Array{T}(0)) 42 | else 43 | v = sorted[p_sorted:p_till-1] 44 | @async put!(r, v) 45 | end 46 | 47 | p_sorted = p_till 48 | end 49 | 50 | # wait to receive all of my parts from all other workers 51 | lp_sorting=T[] 52 | for _ in refs 53 | v = take!(refs[myidx]) 54 | append!(lp_sorting, v) 55 | end 56 | 57 | sorted_ref=RemoteChannel() 58 | put!(sorted_ref, sort!(lp_sorting; by = by, kwargs...)) 59 | return (sorted_ref, length(lp_sorting)) 60 | end 61 | 62 | function compute_boundaries{T}(d::DVector{T}; kwargs...) 63 | pids = procs(d) 64 | np = length(pids) 65 | sample_sz_on_wrkr = 512 66 | 67 | results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids) 68 | 69 | samples = Array{T}(0) 70 | for x in results 71 | append!(samples, x[1]) 72 | end 73 | sort!(samples; kwargs...) 74 | samples[1] = typemin(T) 75 | 76 | refs=RemoteChannel[x[2] for x in results] 77 | 78 | boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]] 79 | push!(boundaries, typemax(T)) 80 | 81 | return (boundaries, refs) 82 | end 83 | 84 | """ 85 | sort(d::DVector; sample=true, kwargs...) -> DVector 86 | 87 | Sorts and returns a new distributed vector. 88 | 89 | The sorted vector may not have the same distribution as the original. 90 | 91 | Keyword argument `sample` can take values: 92 | 93 | - `true`: A sample of max size 512 is first taken from all nodes. This is used to balance the distribution of the sorted array on participating workers. Default is `true`. 94 | 95 | - `false`: No sampling is done. Assumes a uniform distribution between min(d) and max(d) 96 | 97 | - 2-element tuple of the form `(min, max)`: No sampling is done. Assumes a uniform distribution between specified min and max values 98 | 99 | - Array{T}: The passed array is assumed to be a sample of the distribution and is used to balance the sorted distribution. 100 | 101 | Keyword argument `alg` takes the same options `Base.sort` 102 | """ 103 | function Base.sort{T}(d::DVector{T}; sample=true, kwargs...) 104 | pids = procs(d) 105 | np = length(pids) 106 | 107 | # Only `alg` and `sample` are supported as keyword arguments 108 | if length(filter(x->!(x in (:alg, :by)), [x[1] for x in kwargs])) > 0 109 | throw(ArgumentError("Only `alg`, `by` and `sample` are supported as keyword arguments")) 110 | end 111 | 112 | if sample==true 113 | boundaries, refs = compute_boundaries(d; kwargs...) 
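        # compute_boundaries has already sorted each localpart (the sorted vectors sit in `refs`),
        # so scatter_n_sort_localparts is later called with `d == nothing` and skips re-sorting.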
114 | presorted=true 115 | 116 | elseif sample==false 117 | # Assume an uniform distribution between min and max values 118 | minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids) 119 | min_d = minimum(T[x[1] for x in minmax]) 120 | max_d = maximum(T[x[2] for x in minmax]) 121 | 122 | return sort(d; sample=(min_d,max_d), kwargs...) 123 | 124 | elseif isa(sample, Tuple) 125 | # Assume an uniform distribution between min and max values in the tuple 126 | lb=sample[1] 127 | ub=sample[2] 128 | 129 | @assert lb<=ub 130 | 131 | s = Array{T}(np) 132 | part = abs(ub - lb)/np 133 | (isnan(part) || isinf(part)) && throw(ArgumentError("lower and upper bounds must not be infinities")) 134 | 135 | for n in 1:np 136 | v = lb + (n-1)*part 137 | if T <: Integer 138 | s[n] = round(v) 139 | else 140 | s[n] = v 141 | end 142 | end 143 | return sort(d; sample=s, kwargs...) 144 | 145 | elseif isa(sample, Array) 146 | # Provided array is used as a sample 147 | samples = sort(copy(sample)) 148 | samples[1] = typemin(T) 149 | boundaries = samples[[1+(x-1)*div(length(samples), np) for x in 1:np]] 150 | push!(boundaries, typemax(T)) 151 | presorted=false 152 | 153 | refs=RemoteChannel[RemoteChannel(p) for p in procs(d)] 154 | else 155 | throw(ArgumentError("keyword arg `sample` must be Boolean, Tuple(Min,Max) or an actual sample of data : " * string(sample))) 156 | end 157 | 158 | local_sort_results = Array{Tuple}(np) 159 | 160 | Base.asyncmap!((i,p) -> remotecall_fetch( 161 | scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...), 162 | local_sort_results, 1:np, pids) 163 | 164 | # Construct a new DArray from the sorted refs. Remove parts with 0-length since 165 | # the DArray constructor_from_refs does not yet support it. This implies that 166 | # the participating workers for the sorted darray may be different from the original 167 | # for highly non-uniform distributions. 
168 | local_sorted_refs = RemoteChannel[x[1] for x in filter(x->x[2]>0, local_sort_results)] 169 | return DArray(local_sorted_refs) 170 | end 171 | -------------------------------------------------------------------------------- /src/spmd.jl: -------------------------------------------------------------------------------- 1 | module SPMD 2 | 3 | import DistributedArrays: gather, next_did, close 4 | import Base.recvfrom # UDP socket 5 | export sendto, recvfrom, recvfrom_any, barrier, bcast, scatter, gather 6 | export context_local_storage, context, spmd, close 7 | 8 | 9 | type WorkerDataChannel 10 | pid::Int 11 | rc::Nullable{RemoteChannel} 12 | lock::ReentrantLock 13 | 14 | WorkerDataChannel(pid) = new(pid, Nullable{RemoteChannel}(), ReentrantLock()) 15 | end 16 | 17 | type SPMDContext 18 | id::Tuple 19 | chnl::Channel 20 | store::Dict{Any,Any} 21 | pids::Array 22 | release::Bool 23 | 24 | function SPMDContext(id) 25 | ctxt = new(id, Channel(typemax(Int)), Dict{Any,Any}(), [], false) 26 | finalizer(ctxt, finalize_ctxt) 27 | ctxt 28 | end 29 | end 30 | 31 | function finalize_ctxt(ctxt::SPMDContext) 32 | ctxt.release && close(ctxt) 33 | end 34 | 35 | function context_local_storage() 36 | ctxt = get_ctxt_from_id(task_local_storage(:SPMD_CTXT)) 37 | ctxt.store 38 | end 39 | 40 | function context(pids=procs()) 41 | global map_ctxts 42 | ctxt = SPMDContext(next_did()) 43 | ctxt.pids = pids 44 | ctxt.release = true 45 | ctxt 46 | end 47 | 48 | # Every worker is associated with its own RemoteChannel 49 | const map_worker_channels = Dict{Int, WorkerDataChannel}() 50 | 51 | # mapping between a context id and context object 52 | const map_ctxts = Dict{Tuple, SPMDContext}() 53 | 54 | # Multiple SPMD blocks can be executed concurrently, 55 | # each in its own context. Messages are still sent as part of the 56 | # same remote channels associated with each worker. They are 57 | # read from the remote channel into local channels each associated 58 | # with a different run of `spmd`. 59 | 60 | function get_dc(wc::WorkerDataChannel) 61 | lock(wc.lock) 62 | try 63 | if isnull(wc.rc) 64 | if wc.pid == myid() 65 | myrc = RemoteChannel(()->Channel(typemax(Int))) 66 | wc.rc = Nullable{RemoteChannel}(myrc) 67 | 68 | # start a task to transfer incoming messages into local 69 | # channels based on the execution context 70 | @schedule begin 71 | while true 72 | msg = take!(myrc) 73 | ctxt_id = msg[1] # First element of the message tuple is the context id. 74 | ctxt = get_ctxt_from_id(ctxt_id) 75 | put!(ctxt.chnl, msg[2:end]) # stripping the context_id 76 | end 77 | end 78 | else 79 | wc.rc = Nullable{RemoteChannel}(remotecall_fetch(()->get_remote_dc(myid()), wc.pid)) 80 | end 81 | end 82 | finally 83 | unlock(wc.lock) 84 | end 85 | return get(wc.rc) 86 | end 87 | 88 | function get_ctxt_from_id(ctxt_id) 89 | global map_ctxts 90 | ctxt = get(map_ctxts, ctxt_id, nothing) 91 | if ctxt == nothing 92 | ctxt = SPMDContext(ctxt_id) 93 | map_ctxts[ctxt_id] = ctxt 94 | end 95 | return ctxt 96 | end 97 | 98 | 99 | # Since modules may be loaded in any order on the workers, 100 | # and workers may be dynamically added, pull in the remote channel 101 | # handles when accessed for the first time. 
102 | function get_remote_dc(pid) 103 | global map_worker_channels 104 | if !haskey(map_worker_channels, pid) 105 | map_worker_channels[pid] = WorkerDataChannel(pid) 106 | end 107 | 108 | return get_dc(map_worker_channels[pid]) 109 | end 110 | 111 | function send_msg(to, typ, data, tag) 112 | ctxt_id = task_local_storage(:SPMD_CTXT) 113 | @async begin 114 | dc = get_remote_dc(to) 115 | put!(dc, (ctxt_id, typ, myid(), data, tag)) 116 | # println("Sent to ", dc) 117 | end 118 | end 119 | 120 | function get_msg(typ_check, from_check=false, tag_check=nothing) 121 | ctxt_id = task_local_storage(:SPMD_CTXT) 122 | chnl = get_ctxt_from_id(ctxt_id).chnl 123 | 124 | unexpected_msgs=[] 125 | while true 126 | typ, from, data, tag = take!(chnl) 127 | 128 | if (from_check != false && from_check != from) || (typ != typ_check) || (tag != tag_check) 129 | push!(unexpected_msgs, (typ, from, data, tag)) 130 | # println("Unexpected in get_msg ", unexpected_msgs, " looking for ", typ_check, " ", from_check, " ", tag_check) 131 | else 132 | # put all the messages we read (but not expected) back to the local channel 133 | foreach(x->put!(chnl, x), unexpected_msgs) 134 | return (from, data) 135 | end 136 | end 137 | end 138 | 139 | function sendto(pid::Int, data::Any; tag=nothing) 140 | send_msg(pid, :sendto, data, tag) 141 | end 142 | 143 | function recvfrom(pid::Int; tag=nothing) 144 | _, data = get_msg(:sendto, pid, tag) 145 | return data 146 | end 147 | 148 | function recvfrom_any(; tag=nothing) 149 | from, data = get_msg(:sendto, false, tag) 150 | return (from,data) 151 | end 152 | 153 | function barrier(;pids=procs(), tag=nothing) 154 | # send a message to everyone 155 | for p in sort(pids) 156 | send_msg(p, :barrier, nothing, tag) 157 | end 158 | # make sure we recv a message from everyone 159 | pending=deepcopy(pids) 160 | unexpected_msgs=[] 161 | 162 | while length(pending) > 0 163 | from, _ = get_msg(:barrier, false, tag) 164 | if from in pending 165 | filter!(x->x!=from, pending) 166 | else 167 | # handle case of 2 (or more) consecutive barrier calls. 
168 | push!(unexpected_msgs, (:barrier, from, nothing, tag)) 169 | # println("Unexpected ", from) 170 | end 171 | # length(pending) == 1 && println("Waiting for ", pending) 172 | end 173 | 174 | ctxt_id = task_local_storage(:SPMD_CTXT) 175 | chnl = get_ctxt_from_id(ctxt_id).chnl 176 | foreach(x->put!(chnl, x), unexpected_msgs) 177 | return nothing 178 | end 179 | 180 | function bcast(data::Any, pid::Int; tag=nothing, pids=procs()) 181 | if myid() == pid 182 | for p in filter(x->x!=pid, sort(pids)) 183 | send_msg(p, :bcast, data, tag) 184 | end 185 | return data 186 | else 187 | from, data = get_msg(:bcast, pid, tag) 188 | return data 189 | end 190 | end 191 | 192 | function scatter(x, pid::Int; tag=nothing, pids=procs()) 193 | if myid() == pid 194 | @assert rem(length(x), length(pids)) == 0 195 | cnt = div(length(x), length(pids)) 196 | for (i,p) in enumerate(sort(pids)) 197 | p == pid && continue 198 | send_msg(p, :scatter, x[cnt*(i-1)+1:cnt*i], tag) 199 | end 200 | myidx = findfirst(sort(pids), pid) 201 | return x[cnt*(myidx-1)+1:cnt*myidx] 202 | else 203 | _, data = get_msg(:scatter, pid, tag) 204 | return data 205 | end 206 | end 207 | 208 | function gather(x, pid::Int; tag=nothing, pids=procs()) 209 | if myid() == pid 210 | gathered_data = Array{Any}(length(pids)) 211 | myidx = findfirst(sort(pids), pid) 212 | gathered_data[myidx] = x 213 | n = length(pids) - 1 214 | while n > 0 215 | from, data_x = get_msg(:gather, false, tag) 216 | fromidx = findfirst(sort(pids), from) 217 | gathered_data[fromidx] = data_x 218 | n=n-1 219 | end 220 | return gathered_data 221 | else 222 | send_msg(pid, :gather, x, tag) 223 | return x 224 | end 225 | end 226 | 227 | function spmd_local(f, ctxt_id, clear_ctxt) 228 | task_local_storage(:SPMD_CTXT, ctxt_id) 229 | f() 230 | clear_ctxt && delete_ctxt_id(ctxt_id) 231 | return nothing 232 | end 233 | 234 | function spmd(f, args...; pids=procs(), context=nothing) 235 | f_noarg = ()->f(args...) 236 | clear_ctxt = false 237 | if context == nothing 238 | ctxt_id = next_did() 239 | clear_ctxt = true # temporary unique context created for this run. 240 | # should be cleared at the end of the run. 
241 | else 242 | ctxt_id = context.id 243 | end 244 | @sync for p in pids 245 | @async remotecall_fetch(spmd_local, p, f_noarg, ctxt_id, clear_ctxt) 246 | end 247 | nothing 248 | end 249 | 250 | function delete_ctxt_id(ctxt_id) 251 | global map_ctxts 252 | haskey(map_ctxts, ctxt_id) && delete!(map_ctxts, ctxt_id) 253 | nothing 254 | end 255 | 256 | function close(ctxt::SPMDContext) 257 | for p in ctxt.pids 258 | Base.remote_do(delete_ctxt_id, p, ctxt.id) 259 | end 260 | ctxt.release = false 261 | end 262 | 263 | end -------------------------------------------------------------------------------- /src/linalg.jl: -------------------------------------------------------------------------------- 1 | function Base.ctranspose{T}(D::DArray{T,2}) 2 | DArray(reverse(size(D)), procs(D)) do I 3 | lp = Array{T}(map(length, I)) 4 | rp = convert(Array, D[reverse(I)...]) 5 | ctranspose!(lp, rp) 6 | end 7 | end 8 | 9 | function Base.transpose{T}(D::DArray{T,2}) 10 | DArray(reverse(size(D)), procs(D)) do I 11 | lp = Array{T}(map(length, I)) 12 | rp = convert(Array, D[reverse(I)...]) 13 | transpose!(lp, rp) 14 | end 15 | end 16 | 17 | typealias DVector{T,A} DArray{T,1,A} 18 | typealias DMatrix{T,A} DArray{T,2,A} 19 | 20 | # Level 1 21 | 22 | function axpy!(α, x::DVector, y::DVector) 23 | if length(x) != length(y) 24 | throw(DimensionMismatch("vectors must have same length")) 25 | end 26 | @sync for p in procs(y) 27 | @async remotecall_fetch(() -> (Base.axpy!(α, localpart(x), localpart(y)); nothing), p) 28 | end 29 | return y 30 | end 31 | 32 | function dot(x::DVector, y::DVector) 33 | if length(x) != length(y) 34 | throw(DimensionMismatch("")) 35 | end 36 | if (procs(x) != procs(y)) || (x.cuts != y.cuts) 37 | throw(ArgumentError("vectors don't have the same distribution. Not handled for efficiency reasons.")) 38 | end 39 | 40 | results=Any[] 41 | @sync begin 42 | for i = eachindex(x.pids) 43 | @async push!(results, remotecall_fetch((x, y, i) -> dot(localpart(x), fetch(y, i)), x.pids[i], x, y, i)) 44 | end 45 | end 46 | return reduce(+, results) 47 | end 48 | 49 | function norm(x::DVector, p::Real = 2) 50 | results = [] 51 | @sync begin 52 | for pp in procs(x) 53 | @async push!(results, remotecall_fetch(() -> norm(localpart(x), p), pp)) 54 | end 55 | end 56 | return norm(results, p) 57 | end 58 | 59 | Base.scale!(A::DArray, x::Number) = begin 60 | @sync for p in procs(A) 61 | @async remotecall_fetch((A,x)->(scale!(localpart(A), x); nothing), p, A, x) 62 | end 63 | return A 64 | end 65 | 66 | # Level 2 67 | function add!(dest, src, scale = one(dest[1])) 68 | if length(dest) != length(src) 69 | throw(DimensionMismatch("source and destination arrays must have same number of elements")) 70 | end 71 | if scale == one(scale) 72 | @simd for i = eachindex(dest) 73 | @inbounds dest[i] += src[i] 74 | end 75 | else 76 | @simd for i = eachindex(dest) 77 | @inbounds dest[i] += scale*src[i] 78 | end 79 | end 80 | return dest 81 | end 82 | 83 | function A_mul_B!(α::Number, A::DMatrix, x::AbstractVector, β::Number, y::DVector) 84 | 85 | # error checks 86 | if size(A, 2) != length(x) 87 | throw(DimensionMismatch("")) 88 | end 89 | if y.cuts[1] != A.cuts[1] 90 | throw(ArgumentError("cuts of output vector must match cuts of first dimension of matrix")) 91 | end 92 | 93 | # Multiply on each tile of A 94 | R = Array{Future}(size(A.pids)...) 
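    # R[i,j] holds a Future for the product of A's localpart on process-grid tile (i,j) with the
    # slice of x selected by A's column cuts; the fetched results are accumulated into y below.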
95 | for j = 1:size(A.pids, 2) 96 | xj = x[A.cuts[2][j]:A.cuts[2][j + 1] - 1] 97 | for i = 1:size(A.pids, 1) 98 | R[i,j] = remotecall(procs(A)[i,j]) do 99 | localpart(A)*convert(localtype(x), xj) 100 | end 101 | end 102 | end 103 | 104 | # Scale y if necessary 105 | if β != one(β) 106 | @sync for p in y.pids 107 | if β != zero(β) 108 | @async remotecall_fetch(y -> (scale!(localpart(y), β); nothing), p, y) 109 | else 110 | @async remotecall_fetch(y -> (fill!(localpart(y), 0); nothing), p, y) 111 | end 112 | end 113 | end 114 | 115 | # Update y 116 | @sync for i = 1:size(R, 1) 117 | p = y.pids[i] 118 | for j = 1:size(R, 2) 119 | rij = R[i,j] 120 | @async remotecall_fetch(() -> (add!(localpart(y), fetch(rij), α); nothing), p) 121 | end 122 | end 123 | 124 | return y 125 | end 126 | 127 | function Ac_mul_B!(α::Number, A::DMatrix, x::AbstractVector, β::Number, y::DVector) 128 | 129 | # error checks 130 | if size(A, 1) != length(x) 131 | throw(DimensionMismatch("")) 132 | end 133 | if y.cuts[1] != A.cuts[2] 134 | throw(ArgumentError("cuts of output vector must match cuts of second dimension of matrix")) 135 | end 136 | 137 | # Multiply on each tile of A 138 | R = Array{Future}(reverse(size(A.pids))...) 139 | for j = 1:size(A.pids, 1) 140 | xj = x[A.cuts[1][j]:A.cuts[1][j + 1] - 1] 141 | for i = 1:size(A.pids, 2) 142 | R[i,j] = remotecall(() -> localpart(A)'*convert(localtype(x), xj), procs(A)[j,i]) 143 | end 144 | end 145 | 146 | # Scale y if necessary 147 | if β != one(β) 148 | @sync for p in y.pids 149 | if β != zero(β) 150 | @async remotecall_fetch(() -> (scale!(localpart(y), β); nothing), p) 151 | else 152 | @async remotecall_fetch(() -> (fill!(localpart(y), 0); nothing), p) 153 | end 154 | end 155 | end 156 | 157 | # Update y 158 | @sync for i = 1:size(R, 1) 159 | p = y.pids[i] 160 | for j = 1:size(R, 2) 161 | rij = R[i,j] 162 | @async remotecall_fetch(() -> (add!(localpart(y), fetch(rij), α); nothing), p) 163 | end 164 | end 165 | return y 166 | end 167 | 168 | function Base.LinAlg.scale!(b::AbstractVector, DA::DMatrix) 169 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 170 | b[DA.indexes[pididx][1]] 171 | end 172 | map_localparts!(DA) do lDA 173 | scale!(localpart(s), lDA) 174 | end 175 | end 176 | 177 | function Base.LinAlg.scale!(DA::DMatrix, b::AbstractVector) 178 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 179 | b[DA.indexes[pididx][2]] 180 | end 181 | map_localparts!(DA) do lDA 182 | scale!(lDA, localpart(s)) 183 | end 184 | end 185 | 186 | # Level 3 187 | function _matmatmul!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix, tA) 188 | # error checks 189 | Ad1, Ad2 = (tA == 'N') ? 
(1,2) : (2,1) 190 | mA, nA = size(A, Ad1, Ad2) 191 | mB, nB = size(B) 192 | if mB != nA 193 | throw(DimensionMismatch("matrix A has dimensions ($mA, $nA), matrix B has dimensions ($mB, $nB)")) 194 | end 195 | if size(C,1) != mA || size(C,2) != nB 196 | throw(DimensionMismatch("result C has dimensions $(size(C)), needs ($mA, $nB)")) 197 | end 198 | if C.cuts[1] != A.cuts[Ad1] 199 | throw(ArgumentError("cuts of the first dimension of the output matrix must match cuts of dimension $Ad1 of the first input matrix")) 200 | end 201 | 202 | # Multiply on each tile of A 203 | if tA == 'N' 204 | R = Array{Future}(size(procs(A))..., size(procs(C), 2)) 205 | else 206 | R = Array{Future}(reverse(size(procs(A)))..., size(procs(C), 2)) 207 | end 208 | for j = 1:size(A.pids, Ad2) 209 | for k = 1:size(C.pids, 2) 210 | Acuts = A.cuts[Ad2] 211 | Ccuts = C.cuts[2] 212 | Bjk = B[Acuts[j]:Acuts[j + 1] - 1, Ccuts[k]:Ccuts[k + 1] - 1] 213 | for i = 1:size(A.pids, Ad1) 214 | p = (tA == 'N') ? procs(A)[i,j] : procs(A)[j,i] 215 | R[i,j,k] = remotecall(p) do 216 | if tA == 'T' 217 | return localpart(A).'*convert(localtype(B), Bjk) 218 | elseif tA == 'C' 219 | return localpart(A)'*convert(localtype(B), Bjk) 220 | else 221 | return localpart(A)*convert(localtype(B), Bjk) 222 | end 223 | end 224 | end 225 | end 226 | end 227 | 228 | # Scale C if necessary 229 | if β != one(β) 230 | @sync for p in C.pids 231 | if β != zero(β) 232 | @async remotecall_fetch(() -> (scale!(localpart(C), β); nothing), p) 233 | else 234 | @async remotecall_fetch(() -> (fill!(localpart(C), 0); nothing), p) 235 | end 236 | end 237 | end 238 | 239 | # Update C 240 | @sync for i = 1:size(R, 1) 241 | for k = 1:size(C.pids, 2) 242 | p = C.pids[i,k] 243 | for j = 1:size(R, 2) 244 | rijk = R[i,j,k] 245 | @async remotecall_fetch(d -> (add!(localpart(d), fetch(rijk), α); nothing), p, C) 246 | end 247 | end 248 | end 249 | return C 250 | end 251 | 252 | A_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'N') 253 | Ac_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'C') 254 | At_mul_B!(α::Number, A::DMatrix, B::AbstractMatrix, β::Number, C::DMatrix) = _matmatmul!(α, A, B, β, C, 'T') 255 | At_mul_B!(C::DMatrix, A::DMatrix, B::AbstractMatrix) = At_mul_B!(one(eltype(C)), A, B, zero(eltype(C)), C) 256 | 257 | _matmul_op = (t,s) -> t*s + t*s 258 | 259 | function (*)(A::DMatrix, x::AbstractVector) 260 | T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) 261 | y = DArray(I -> Array{T}(map(length, I)), (size(A, 1),), procs(A)[:,1], (size(procs(A), 1),)) 262 | return A_mul_B!(one(T), A, x, zero(T), y) 263 | end 264 | function (*)(A::DMatrix, B::AbstractMatrix) 265 | T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) 266 | C = DArray(I -> Array{T}(map(length, I)), 267 | (size(A, 1), size(B, 2)), 268 | procs(A)[:,1:min(size(procs(A), 2), size(procs(B), 2))], 269 | (size(procs(A), 1), min(size(procs(A), 2), size(procs(B), 2)))) 270 | return A_mul_B!(one(T), A, B, zero(T), C) 271 | end 272 | 273 | function Ac_mul_B(A::DMatrix, x::AbstractVector) 274 | T = Base.promote_op(_matmul_op, eltype(A), eltype(x)) 275 | y = DArray(I -> Array{T}(map(length, I)), 276 | (size(A, 2),), 277 | procs(A)[1,:], 278 | (size(procs(A), 2),)) 279 | return Ac_mul_B!(one(T), A, x, zero(T), y) 280 | end 281 | function Ac_mul_B(A::DMatrix, B::AbstractMatrix) 282 | T = Base.promote_op(_matmul_op, eltype(A), eltype(B)) 283 | C = DArray(I -> Array{T}(map(length, I)), (size(A, 2), 284 | 
size(B, 2)), 285 | procs(A)[1:min(size(procs(A), 1), size(procs(B), 2)),:], 286 | (size(procs(A), 2), min(size(procs(A), 1), size(procs(B), 2)))) 287 | return Ac_mul_B!(one(T), A, B, zero(T), C) 288 | end 289 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | ## higher-order functions ## 2 | 3 | Base.map(f, d::DArray) = DArray(I->map(f, localpart(d)), d) 4 | 5 | Base.map!{F}(f::F, dest::DArray, src::DArray) = begin 6 | @sync for p in procs(dest) 7 | @async remotecall_fetch(() -> (map!(f, localpart(dest), src[localindexes(dest)...]); nothing), p) 8 | end 9 | return dest 10 | end 11 | 12 | Base.Broadcast._containertype{D<:DArray}(::Type{D}) = DArray 13 | 14 | Base.Broadcast.promote_containertype(::Type{DArray}, ::Type{DArray}) = DArray 15 | Base.Broadcast.promote_containertype(::Type{DArray}, ::Type{Array}) = DArray 16 | Base.Broadcast.promote_containertype(::Type{DArray}, ct) = DArray 17 | Base.Broadcast.promote_containertype(::Type{Array}, ::Type{DArray}) = DArray 18 | Base.Broadcast.promote_containertype(ct, ::Type{DArray}) = DArray 19 | 20 | Base.Broadcast.broadcast_indices(::Type{DArray}, A) = indices(A) 21 | Base.Broadcast.broadcast_indices(::Type{DArray}, A::Ref) = () 22 | 23 | # FixMe! 24 | ## 1. Support for arbitrary indices including OneTo 25 | ## 2. This is as type unstable as it can be. Overhead might not matter too much for DArrays though. 26 | function Base.Broadcast.broadcast_c(f, ::Type{DArray}, As...) 27 | T = Base.Broadcast._broadcast_eltype(f, As...) 28 | shape = Base.Broadcast.broadcast_indices(As...) 29 | iter = Base.CartesianRange(shape) 30 | D = DArray(map(length, shape)) do I 31 | Base.Broadcast.broadcast_c(f, Array, 32 | map(a -> isa(a, Union{Number,Ref}) ? a : 33 | localtype(a)(a[ntuple(i -> i > ndims(a) ? 1 : (size(a, i) == 1 ? (1:1) : I[i]), length(shape))...]), As)...) 34 | end 35 | return D 36 | end 37 | 38 | function Base.reduce(f, d::DArray) 39 | results=[] 40 | @sync begin 41 | for p in procs(d) 42 | @async push!(results, remotecall_fetch((f,d)->reduce(f, localpart(d)), p, f, d)) 43 | end 44 | end 45 | reduce(f, results) 46 | end 47 | 48 | function _mapreduce(f, opt, d::DArray) 49 | # TODO Change to an @async remotecall_fetch - will reduce one extra network hop - 50 | # once bug in master is fixed. 51 | results=[] 52 | @sync begin 53 | for p in procs(d) 54 | @async push!(results, remotecall_fetch((f,opt,d)->mapreduce(f, opt, localpart(d)), p, f, opt, d)) 55 | end 56 | end 57 | reduce(opt, results) 58 | end 59 | Base.mapreduce(f, opt::Union{typeof(|), typeof(&)}, d::DArray) = _mapreduce(f, opt, d) 60 | Base.mapreduce(f, opt::Function, d::DArray) = _mapreduce(f, opt, d) 61 | Base.mapreduce(f, opt, d::DArray) = _mapreduce(f, opt, d) 62 | 63 | # mapreducedim 64 | Base.reducedim_initarray{R}(A::DArray, region, v0, ::Type{R}) = begin 65 | # Store reduction on lowest pids 66 | pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...] 67 | chunks = similar(pids, Future) 68 | @sync for i in eachindex(pids) 69 | @async chunks[i...] 
= remotecall_wait(() -> Base.reducedim_initarray(localpart(A), region, v0, R), pids[i...]) 70 | end 71 | return DArray(chunks) 72 | end 73 | Base.reducedim_initarray{T}(A::DArray, region, v0::T) = Base.reducedim_initarray(A, region, v0, T) 74 | 75 | Base.reducedim_initarray0{R}(A::DArray, region, v0, ::Type{R}) = begin 76 | # Store reduction on lowest pids 77 | pids = A.pids[ntuple(i -> i in region ? (1:1) : (:), ndims(A))...] 78 | chunks = similar(pids, Future) 79 | @sync for i in eachindex(pids) 80 | @async chunks[i...] = remotecall_wait(() -> Base.reducedim_initarray0(localpart(A), region, v0, R), pids[i...]) 81 | end 82 | return DArray(chunks) 83 | end 84 | Base.reducedim_initarray0{T}(A::DArray, region, v0::T) = Base.reducedim_initarray0(A, region, v0, T) 85 | 86 | # Compute mapreducedim of each localpart and store the result in a new DArray 87 | mapreducedim_within(f, op, A::DArray, region) = begin 88 | arraysize = [size(A)...] 89 | gridsize = [size(A.indexes)...] 90 | arraysize[[region...]] = gridsize[[region...]] 91 | indx = similar(A.indexes) 92 | for i in CartesianRange(size(indx)) 93 | indx[i] = ntuple(j -> j in region ? (i.I[j]:i.I[j]) : A.indexes[i][j], ndims(A)) 94 | end 95 | cuts = [i in region ? collect(1:arraysize[i] + 1) : A.cuts[i] for i in 1:ndims(A)] 96 | return DArray(next_did(), I -> mapreducedim(f, op, localpart(A), region), 97 | tuple(arraysize...), procs(A), indx, cuts) 98 | end 99 | 100 | # Compute mapreducedim accros the processes. This should be done after mapreducedim 101 | # has been run on each localpart with mapreducedim_within. Eventually, we might 102 | # want to write mapreducedim_between! as a binary reduction. 103 | function mapreducedim_between!(f, op, R::DArray, A::DArray, region) 104 | @sync for p in procs(R) 105 | @async remotecall_fetch(p, f, op, R, A, region) do f, op, R, A, region 106 | localind = [r for r = localindexes(A)] 107 | localind[[region...]] = [1:n for n = size(A)[[region...]]] 108 | B = convert(Array, A[localind...]) 109 | Base.mapreducedim!(f, op, localpart(R), B) 110 | nothing 111 | end 112 | end 113 | return R 114 | end 115 | 116 | Base.mapreducedim!(f, op, R::DArray, A::DArray) = begin 117 | lsize = Base.check_reducedims(R,A) 118 | if isempty(A) 119 | return copy(R) 120 | end 121 | region = tuple(collect(1:ndims(A))[[size(R)...] .!= [size(A)...]]...) 122 | if isempty(region) 123 | return copy!(R, A) 124 | end 125 | B = mapreducedim_within(f, op, A, region) 126 | return mapreducedim_between!(identity, op, R, B, region) 127 | end 128 | 129 | Base.mapreducedim(f, op, R::DArray, A::DArray) = begin 130 | Base.mapreducedim!(f, op, Base.reducedim_initarray(A, region, v0), A) 131 | end 132 | 133 | function nnz(A::DArray) 134 | B = Array{Any}(size(A.pids)) 135 | @sync begin 136 | for i in eachindex(A.pids) 137 | @async B[i...] 
= remotecall_fetch(x -> nnz(localpart(x)), A.pids[i...], A) 138 | end 139 | end 140 | return reduce(+, B) 141 | end 142 | 143 | # reduce like 144 | for (fn, fr) in ((:sum, :+), 145 | (:prod, :*), 146 | (:maximum, :max), 147 | (:minimum, :min), 148 | (:any, :|), 149 | (:all, :&)) 150 | @eval (Base.$fn)(d::DArray) = reduce($fr, d) 151 | end 152 | 153 | # mapreduce like 154 | for (fn, fr1, fr2) in ((:maxabs, :abs, :max), 155 | (:minabs, :abs, :min), 156 | (:sumabs, :abs, :+), 157 | (:sumabs2, :abs2, :+)) 158 | @eval (Base.$fn)(d::DArray) = mapreduce($fr1, $fr2, d) 159 | end 160 | 161 | # semi mapreduce 162 | for (fn, fr) in ((:any, :|), 163 | (:all, :&), 164 | (:count, :+)) 165 | @eval begin 166 | (Base.$fn)(f::typeof(identity), d::DArray) = mapreduce(f, $fr, d) 167 | (Base.$fn)(f::Callable, d::DArray) = mapreduce(f, $fr, d) 168 | end 169 | end 170 | 171 | # Unary vector functions 172 | (-)(D::DArray) = map(-, D) 173 | 174 | @static if VERSION < v"0.6.0-dev.1731" 175 | # scalar ops 176 | (+)(A::DArray{Bool}, x::Bool) = A .+ x 177 | (+)(x::Bool, A::DArray{Bool}) = x .+ A 178 | (-)(A::DArray{Bool}, x::Bool) = A .- x 179 | (-)(x::Bool, A::DArray{Bool}) = x .- A 180 | (+)(A::DArray, x::Number) = A .+ x 181 | (+)(x::Number, A::DArray) = x .+ A 182 | (-)(A::DArray, x::Number) = A .- x 183 | (-)(x::Number, A::DArray) = x .- A 184 | end 185 | 186 | map_localparts(f::Callable, d::DArray) = DArray(i->f(localpart(d)), d) 187 | map_localparts(f::Callable, d1::DArray, d2::DArray) = DArray(d1) do I 188 | f(localpart(d1), localpart(d2)) 189 | end 190 | 191 | function map_localparts(f::Callable, DA::DArray, A::Array) 192 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 193 | A[DA.indexes[pididx]...] 194 | end 195 | DArray(DA) do I 196 | f(localpart(DA), localpart(s)) 197 | end 198 | end 199 | 200 | function map_localparts(f::Callable, A::Array, DA::DArray) 201 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 202 | A[DA.indexes[pididx]...] 203 | end 204 | DArray(DA) do I 205 | f(localpart(s), localpart(DA)) 206 | end 207 | end 208 | 209 | function map_localparts!(f::Callable, d::DArray) 210 | @sync for p in procs(d) 211 | @async remotecall_fetch((f,d)->(f(localpart(d)); nothing), p, f, d) 212 | end 213 | return d 214 | end 215 | 216 | # Here we assume all the DArrays have 217 | # the same size and distribution 218 | map_localparts(f::Callable, As::DArray...) 
= DArray(I->f(map(localpart, As)...), As[1]) 219 | 220 | @static if VERSION < v"0.6.0-dev.1632" 221 | for f in (:.+, :.-, :.*, :./, :.%, :.<<, :.>>, :div, :mod, :rem, :&, :|, :$) 222 | @eval begin 223 | ($f){T}(A::DArray{T}, B::Number) = map_localparts(r->($f)(r, B), A) 224 | ($f){T}(A::Number, B::DArray{T}) = map_localparts(r->($f)(A, r), B) 225 | end 226 | end 227 | end 228 | 229 | function samedist(A::DArray, B::DArray) 230 | (size(A) == size(B)) || throw(DimensionMismatch()) 231 | if (procs(A) != procs(B)) || (A.cuts != B.cuts) 232 | B = DArray(x->B[x...], A) 233 | end 234 | B 235 | end 236 | 237 | for f in (:+, :-, :div, :mod, :rem, :&, :|, :$) 238 | @eval begin 239 | function ($f){T}(A::DArray{T}, B::DArray{T}) 240 | B = samedist(A, B) 241 | map_localparts($f, A, B) 242 | end 243 | ($f){T}(A::DArray{T}, B::Array{T}) = map_localparts($f, A, B) 244 | ($f){T}(A::Array{T}, B::DArray{T}) = map_localparts($f, A, B) 245 | end 246 | end 247 | @static if VERSION < v"0.6.0-dev.1632" 248 | for f in (:.+, :.-, :.*, :./, :.%, :.<<, :.>>) 249 | @eval begin 250 | function ($f){T}(A::DArray{T}, B::DArray{T}) 251 | map_localparts($f, A, B) 252 | end 253 | ($f){T}(A::DArray{T}, B::Array{T}) = map_localparts($f, A, B) 254 | ($f){T}(A::Array{T}, B::DArray{T}) = map_localparts($f, A, B) 255 | end 256 | end 257 | end 258 | 259 | function mapslices{T,N,A}(f::Function, D::DArray{T,N,A}, dims::AbstractVector) 260 | if !all(t -> t == 1, size(D.indexes)[dims]) 261 | p = ones(Int, ndims(D)) 262 | nondims = filter(t -> !(t in dims), 1:ndims(D)) 263 | p[nondims] = defaultdist([size(D)...][[nondims...]], procs(D)) 264 | DD = DArray(size(D), procs(D), p) do I 265 | return convert(A, D[I...]) 266 | end 267 | return mapslices(f, DD, dims) 268 | end 269 | 270 | refs = Future[remotecall((x,y,z)->mapslices(x,localpart(y),z), p, f, D, dims) for p in procs(D)] 271 | 272 | DArray(reshape(refs, size(procs(D)))) 273 | end 274 | 275 | function _ppeval(f, A...; dim = map(ndims, A)) 276 | if length(dim) != length(A) 277 | throw(ArgumentError("dim argument has wrong length. length(dim) = $(length(dim)) but should be $(length(A))")) 278 | end 279 | narg = length(A) 280 | dimlength = size(A[1], dim[1]) 281 | for i = 2:narg 282 | if dim[i] > 0 && dimlength != size(A[i], dim[i]) 283 | throw(ArgumentError("lengths of broadcast dimensions must be the same. size(A[1], $(dim[1])) = $dimlength but size(A[$i], $(dim[i])) = $(size(A[i], dim[i]))")) 284 | end 285 | end 286 | dims = [] 287 | idx = [] 288 | args = [] 289 | for i = 1:narg 290 | push!(dims, ndims(A[i])) 291 | push!(idx, Any[1:size(A[i], d) for d in 1:dims[i]]) 292 | if dim[i] > 0 293 | idx[i][dim[i]] = 1 294 | push!(args, view(A[i], idx[i]...)) 295 | else 296 | push!(args, A[i]) 297 | end 298 | end 299 | R1 = f(args...) 300 | ridx = Any[1:size(R1, d) for d in 1:ndims(R1)] 301 | push!(ridx, 1) 302 | Rsize = map(last, ridx) 303 | Rsize[end] = dimlength 304 | R = Array{eltype(R1)}(Rsize...) 305 | 306 | for i = 1:dimlength 307 | for j = 1:narg 308 | if dim[j] > 0 309 | idx[j][dim[j]] = i 310 | args[j] = view(A[j], idx[j]...) 311 | else 312 | args[j] = A[j] 313 | end 314 | end 315 | ridx[end] = i 316 | R[ridx...] = f(args...) 317 | end 318 | 319 | return R 320 | end 321 | 322 | """ 323 | ppeval(f, D...; dim::NTuple) 324 | 325 | Evaluates the callable argument `f` on slices of the elements of the `D` tuple. 326 | 327 | #### Arguments 328 | `f` can be any callable object that accepts sliced or broadcasted elements of `D`. 
329 | The result returned from `f` must be either an array or a scalar. 330 | 331 | `D` has any number of elements and the alements can have any type. If an element 332 | of `D` is a distributed array along the dimension specified by `dim`. If an 333 | element of `D` is not distributed, the element is by default broadcasted and 334 | applied on all evaluations of `f`. 335 | 336 | `dim` is a tuple of integers specifying the dimension over which the elements 337 | of `D` is slices. The length of the tuple must therefore be the same as the 338 | number of arguments `D`. By default distributed arrays are slides along the 339 | last dimension. If the value is less than or equal to zero the element are 340 | broadcasted to all evaluations of `f`. 341 | 342 | #### Result 343 | `ppeval` returns a distributed array of dimension `p+1` where the first `p` 344 | sizes correspond to the sizes of return values of `f`. The last dimention of 345 | the return array from `ppeval` has the same length as the dimension over which 346 | the input arrays are sliced. 347 | 348 | #### Examples 349 | ```jl 350 | addprocs(JULIA_CPU_CORES) 351 | 352 | using DistributedArrays 353 | 354 | A = drandn((10, 10, JULIA_CPU_CORES), workers(), [1, 1, JULIA_CPU_CORES]) 355 | 356 | ppeval(eigvals, A) 357 | 358 | ppeval(eigvals, A, randn(10,10)) # broadcasting second argument 359 | 360 | B = drandn((10, JULIA_CPU_CORES), workers(), [1, JULIA_CPU_CORES]) 361 | 362 | ppeval(*, A, B) 363 | ``` 364 | """ 365 | function ppeval(f, D...; dim::NTuple = map(t -> isa(t, DArray) ? ndims(t) : 0, D)) 366 | #Ensure that the complete DArray is available on the specified dims on all processors 367 | for i = 1:length(D) 368 | if isa(D[i], DArray) 369 | for idxs in D[i].indexes 370 | for d in setdiff(1:ndims(D[i]), dim[i]) 371 | if length(idxs[d]) != size(D[i], d) 372 | throw(DimensionMismatch(string("dimension $d is distributed. ", 373 | "ppeval requires dimension $d to be completely available on all processors."))) 374 | end 375 | end 376 | end 377 | end 378 | end 379 | 380 | refs = Future[remotecall((x, y, z) -> _ppeval(x, map(localpart, y)...; dim = z), p, f, D, dim) for p in procs(D[1])] 381 | 382 | # The array of Futures has to be reshaped for the DArray constructor to work correctly. 383 | # This requires a fetch and the DArray is also fetching so it might be better to modify 384 | # the DArray constructor. 385 | sd = [size(D[1].pids)...] 386 | nd = remotecall_fetch((r)->ndims(fetch(r)), refs[1].where, refs[1]) 387 | DArray(reshape(refs, tuple([sd[1:nd - 1], sd[end];]...))) 388 | end 389 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DistributedArrays.jl 2 | 3 | [![Build Status](https://travis-ci.org/JuliaParallel/DistributedArrays.jl.svg?branch=master)](https://travis-ci.org/JuliaParallel/DistributedArrays.jl) 4 | [![Coverage Status](https://coveralls.io/repos/github/JuliaParallel/DistributedArrays.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaParallel/DistributedArrays.jl?branch=master) 5 | [![codecov](https://codecov.io/gh/JuliaParallel/DistributedArrays.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaParallel/DistributedArrays.jl) 6 | 7 | Distributed Arrays for Julia 8 | 9 | ***NOTE*** 10 | Distributed Arrays will only work on Julia v0.4.0 or later. 
11 | 12 | `DArray`s have been removed from the Julia Base library in v0.4 so it is now necessary to import the `DistributedArrays` package on all spawned processes. 13 | 14 | ```julia 15 | @everywhere using DistributedArrays 16 | ``` 17 | 18 | Distributed Arrays 19 | ------------------ 20 | 21 | Large computations are often organized around large arrays of data. In 22 | these cases, a particularly natural way to obtain parallelism is to 23 | distribute arrays among several processes. This combines the memory 24 | resources of multiple machines, allowing use of arrays too large to fit 25 | on one machine. Each process operates on the part of the array it 26 | owns, providing a ready answer to the question of how a program should 27 | be divided among machines. 28 | 29 | Julia distributed arrays are implemented by the `DArray` type. A 30 | `DArray` has an element type and dimensions just like an `Array`. 31 | A `DArray` can also use arbitrary array-like types to represent the local 32 | chunks that store actual data. The data in a `DArray` is distributed by 33 | dividing the index space into some number of blocks in each dimension. 34 | 35 | Common kinds of arrays can be constructed with functions beginning with 36 | `d`: 37 | 38 | ```julia 39 | dzeros(100,100,10) 40 | dones(100,100,10) 41 | drand(100,100,10) 42 | drandn(100,100,10) 43 | dfill(x,100,100,10) 44 | ``` 45 | 46 | In the last case, each element will be initialized to the specified 47 | value `x`. These functions automatically pick a distribution for you. 48 | For more control, you can specify which processes to use, and how the 49 | data should be distributed: 50 | 51 | ```julia 52 | dzeros((100,100), workers()[1:4], [1,4]) 53 | ``` 54 | 55 | The second argument specifies that the array should be created on the first 56 | four workers. When dividing data among a large number of processes, 57 | one often sees diminishing returns in performance. Placing `DArray`s 58 | on a subset of processes allows multiple `DArray` computations to 59 | happen at once, with a higher ratio of work to communication on each 60 | process. 61 | 62 | The third argument specifies a distribution; the nth element of 63 | this array specifies how many pieces dimension n should be divided into. 64 | In this example the first dimension will not be divided, and the second 65 | dimension will be divided into 4 pieces. Therefore each local chunk will be 66 | of size `(100,25)`. Note that the product of the distribution array must 67 | equal the number of processes. 68 | 69 | * `distribute(a::Array)` converts a local array to a distributed array. 70 | 71 | * `localpart(d::DArray)` obtains the locally-stored portion 72 | of a `DArray`. 73 | 74 | * Localparts can be retrieved and set via the indexing syntax too. 75 | Indexing via symbols is used for this, specifically symbols `:L`,`:LP`,`:l`,`:lp` which 76 | are all equivalent. For example, `d[:L]` returns the localpart of `d` 77 | while `d[:L]=v` sets `v` as the localpart of `d`. 78 | 79 | * `localindexes(a::DArray)` gives a tuple of the index ranges owned by the 80 | local process. 81 | 82 | * `convert(Array, a::DArray)` brings all the data to the local process. 83 | 84 | Indexing a `DArray` (square brackets) with ranges of indexes always 85 | creates a `SubArray`, not copying any data.
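For example, a minimal session using these primitives might look like the following sketch (the array size is arbitrary, and the exact distribution depends on how many workers have been added):

```julia
addprocs(4)
@everywhere using DistributedArrays

a = rand(8, 8)
d = distribute(a)        # distribute the local array over the workers

localindexes(d)          # tuple of index ranges owned by the calling process
localpart(d)             # the locally stored chunk; `d[:L]` is an equivalent way to access it
                         # (typically empty on the master, since the data lives on the workers)
convert(Array, d) == a   # brings all the data back to the calling process
```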
86 | 87 | 88 | Constructing Distributed Arrays 89 | ------------------------------- 90 | 91 | The primitive `DArray` constructor has the following somewhat elaborate signature: 92 | 93 | ```julia 94 | DArray(init, dims[, procs, dist]) 95 | ``` 96 | 97 | `init` is a function that accepts a tuple of index ranges. This function should 98 | allocate a local chunk of the distributed array and initialize it for the specified 99 | indices. `dims` is the overall size of the distributed array. 100 | `procs` optionally specifies a vector of process IDs to use. 101 | `dist` is an integer vector specifying how many chunks the 102 | distributed array should be divided into in each dimension. 103 | 104 | The last two arguments are optional, and defaults will be used if they 105 | are omitted. 106 | 107 | As an example, here is how to turn the local array constructor `fill` 108 | into a distributed array constructor: 109 | 110 | ```julia 111 | dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) 112 | ``` 113 | 114 | In this case the `init` function only needs to call `fill` with the 115 | dimensions of the local piece it is creating. 116 | 117 | `DArray`s can also be constructed from multidimensional `Array` comprehensions with 118 | the `@DArray` macro syntax. This syntax is just sugar for the primitive `DArray` constructor: 119 | 120 | ```julia 121 | julia> [i+j for i = 1:5, j = 1:5] 122 | 5x5 Array{Int64,2}: 123 | 2 3 4 5 6 124 | 3 4 5 6 7 125 | 4 5 6 7 8 126 | 5 6 7 8 9 127 | 6 7 8 9 10 128 | 129 | julia> @DArray [i+j for i = 1:5, j = 1:5] 130 | 5x5 DistributedArrays.DArray{Int64,2,Array{Int64,2}}: 131 | 2 3 4 5 6 132 | 3 4 5 6 7 133 | 4 5 6 7 8 134 | 5 6 7 8 9 135 | 6 7 8 9 10 136 | ``` 137 | 138 | Distributed Array Operations 139 | ---------------------------- 140 | 141 | At this time, distributed arrays do not have much functionality. Their 142 | major utility is allowing communication to be done via array indexing, which 143 | is convenient for many problems. As an example, consider implementing the 144 | "life" cellular automaton, where each cell in a grid is updated according 145 | to its neighboring cells. To compute a chunk of the result of one iteration, 146 | each process needs the immediate neighbor cells of its local chunk. The 147 | following code accomplishes this:: 148 | 149 | ```julia 150 | function life_step(d::DArray) 151 | DArray(size(d),procs(d)) do I 152 | top = mod(first(I[1])-2,size(d,1))+1 153 | bot = mod( last(I[1]) ,size(d,1))+1 154 | left = mod(first(I[2])-2,size(d,2))+1 155 | right = mod( last(I[2]) ,size(d,2))+1 156 | 157 | old = Array(Bool, length(I[1])+2, length(I[2])+2) 158 | old[1 , 1 ] = d[top , left] # left side 159 | old[2:end-1, 1 ] = d[I[1], left] 160 | old[end , 1 ] = d[bot , left] 161 | old[1 , 2:end-1] = d[top , I[2]] 162 | old[2:end-1, 2:end-1] = d[I[1], I[2]] # middle 163 | old[end , 2:end-1] = d[bot , I[2]] 164 | old[1 , end ] = d[top , right] # right side 165 | old[2:end-1, end ] = d[I[1], right] 166 | old[end , end ] = d[bot , right] 167 | 168 | life_rule(old) 169 | end 170 | end 171 | ``` 172 | 173 | As you can see, we use a series of indexing expressions to fetch 174 | data into a local array `old`. Note that the `do` block syntax is 175 | convenient for passing `init` functions to the `DArray` constructor. 176 | Next, the serial function `life_rule` is called to apply the update rules 177 | to the data, yielding the needed `DArray` chunk. 
Nothing about `life_rule` 178 | is `DArray`-specific, but we list it here for completeness: 179 | 180 | ```julia 181 | function life_rule(old) 182 | m, n = size(old) 183 | new = similar(old, m-2, n-2) 184 | for j = 2:n-1 185 | for i = 2:m-1 186 | nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1], 187 | old[i ,j-1], old[i ,j+1], 188 | old[i+1,j-1], old[i+1,j], old[i+1,j+1]) 189 | new[i-1,j-1] = (nc == 3 || nc == 2 && old[i,j]) 190 | end 191 | end 192 | new 193 | end 194 | ``` 195 | 196 | Numerical Results of Distributed Computations 197 | --------------------------------------------- 198 | 199 | Floating point arithmetic is not associative and this comes up 200 | when performing distributed computations over `DArray`s. All `DArray` 201 | operations are performed over the `localpart` chunks and then aggregated. 202 | The change in ordering of the operations will change the numeric result as 203 | seen in this simple example: 204 | 205 | ```julia 206 | julia> addprocs(8); 207 | 208 | julia> @everywhere using DistributedArrays 209 | 210 | julia> A = fill(1.1, (100,100)); 211 | 212 | julia> sum(A) 213 | 11000.000000000013 214 | 215 | julia> DA = distribute(A); 216 | 217 | julia> sum(DA) 218 | 11000.000000000127 219 | 220 | julia> sum(A) == sum(DA) 221 | false 222 | ``` 223 | 224 | The ultimate ordering of operations will be dependent on how the Array is distributed. 225 | 226 | Garbage Collection and DArrays 227 | ------------------------------ 228 | 229 | When a DArray is constructed (typically on the master process), the returned DArray object stores information on how the 230 | array is distributed, which processor holds which indexes, and so on. When the DArray object 231 | on the master process is garbage collected, all participating workers are notified and 232 | the localparts of the DArray are freed on each worker. 233 | 234 | Since the size of the DArray object itself is small, a problem arises as `gc` on the master faces no memory pressure to 235 | collect the DArray immediately. This results in a delay of the memory being released on the participating workers. 236 | 237 | Therefore it is highly recommended to explicitly call `close(d::DArray)` as soon as user code 238 | has finished working with the distributed array. 239 | 240 | It is also important to note that the localparts of the DArray are collected from all participating workers 241 | when the DArray object on the process creating the DArray is collected. It is therefore important to maintain 242 | a reference to a DArray object on the creating process for as long as it is being computed upon. 243 | 244 | `d_closeall()` is another useful function to manage distributed memory. It releases all darrays created from 245 | the calling process, including any temporaries created during computation. 246 | 247 | Working with distributed non-array data 248 | --------------------------------------- 249 | 250 | The function `ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[])` can be used 251 | to create a distributed vector whose localparts need not be Arrays. 252 | 253 | It returns a `DArray{T,1,T}`, i.e., the element type and localtype of the array are the same. 254 | 255 | `ddata()` constructs a distributed vector of length `nworkers()` where each localpart can hold any value, 256 | initialized to `nothing`. 257 | 258 | Argument `data`, if supplied, is distributed over the `pids`. `length(data)` must be a multiple of `length(pids)`.
259 | If the multiple is 1, returns a `DArray{T,1,T}` where `T` is `eltype(data)`. If the multiple is greater than 1, 260 | returns a `DArray{T,1,Array{T,1}}`, i.e., it is equivalent to calling `distribute(data)`. 261 | 262 | `gather{T}(d::DArray{T,1,T})` returns an `Array{T,1}` consisting of all the distributed elements of `d`. 263 | 264 | Given a `DArray{T,1,T}` object `d`, `d[:L]` returns the localpart on a worker. `d[i]` returns the `localpart` 265 | on the ith worker that `d` is distributed over. 266 | 267 | SPMD Mode (An MPI Style SPMD mode with MPI like primitives) 268 | ------------------------------------------------------------ 269 | SPMD, i.e., a Single Program Multiple Data mode, is implemented by the submodule `DistributedArrays.SPMD`. In this mode the same function is executed in parallel on all participating nodes. This is a typical style of MPI programs where the same program is executed on all processors. A basic subset of MPI-like primitives is currently supported. As a programming model it should be familiar to folks with an MPI background. 270 | 271 | The same block of code is executed concurrently on all workers using the `spmd` function. 272 | 273 | ``` 274 | # define foo() on all workers 275 | @everywhere function foo(arg1, arg2) 276 | .... 277 | end 278 | 279 | # call foo() everywhere using the `spmd` function 280 | d_in=DArray(.....) 281 | d_out=ddata() 282 | spmd(foo,d_in,d_out; pids=workers()) # executes on all workers 283 | ``` 284 | 285 | `spmd` is defined as `spmd(f, args...; pids=procs(), context=nothing)`. 286 | 287 | `args` is one or more arguments to be passed to `f`. `pids` identifies the workers 288 | that `f` needs to be run on. `context` identifies a run context, which is explained 289 | later. 290 | 291 | The following primitives can be used in SPMD mode. 292 | 293 | - `sendto(pid, data; tag=nothing)` - sends `data` to `pid` 294 | 295 | - `recvfrom(pid; tag=nothing)` - receives data from `pid` 296 | 297 | - `recvfrom_any(; tag=nothing)` - receives data from any `pid` 298 | 299 | - `barrier(;pids=procs(), tag=nothing)` - all tasks wait and then proceed 300 | 301 | - `bcast(data, pid; tag=nothing, pids=procs())` - broadcasts the same data over `pids` from `pid` 302 | 303 | - `scatter(x, pid; tag=nothing, pids=procs())` - distributes `x` over `pids` from `pid` 304 | 305 | - `gather(x, pid; tag=nothing, pids=procs())` - collects data from `pids` onto worker `pid` 306 | 307 | Tag `tag` should be used to differentiate between consecutive calls of the same type, for example, 308 | consecutive `bcast` calls. 309 | 310 | `spmd` and SPMD-related functions are defined in the submodule `DistributedArrays.SPMD`. You will need to 311 | import it explicitly, or prefix functions that can only be used in SPMD mode with `SPMD.`, for example, 312 | `SPMD.sendto`. 313 | 314 | Example 315 | ------- 316 | 317 | This toy example exchanges data with each of its neighbors `n` times. 318 | 319 | ``` 320 | using DistributedArrays 321 | addprocs(8) 322 | @everywhere importall DistributedArrays 323 | @everywhere importall DistributedArrays.SPMD 324 | 325 | d_in=d=DArray(I->fill(myid(), (map(length,I)...)), (nworkers(), 2), workers(), [nworkers(),1]) 326 | d_out=ddata() 327 | 328 | # define the function everywhere 329 | @everywhere function foo_spmd(d_in, d_out, n) 330 | pids = sort(vec(procs(d_in))) 331 | pididx = findfirst(pids, myid()) 332 | mylp = d_in[:L] 333 | localsum = 0 334 | 335 | # Have each worker exchange data with its neighbors 336 | n_pididx = pididx+1 > length(pids) ?
1 : pididx+1 337 | p_pididx = pididx-1 < 1 ? length(pids) : pididx-1 338 | 339 | for i in 1:n 340 | sendto(pids[n_pididx], mylp[2]) 341 | sendto(pids[p_pididx], mylp[1]) 342 | 343 | mylp[2] = recvfrom(pids[p_pididx]) 344 | mylp[1] = recvfrom(pids[n_pididx]) 345 | 346 | barrier(;pids=pids) 347 | localsum = localsum + mylp[1] + mylp[2] 348 | end 349 | 350 | # finally store the sum in d_out 351 | d_out[:L] = localsum 352 | end 353 | 354 | # run foo_spmd on all workers 355 | spmd(foo_spmd, d_in, d_out, 10) 356 | 357 | # print values of d_in and d_out after the run 358 | println(d_in) 359 | println(d_out) 360 | ``` 361 | 362 | SPMD Context 363 | ------------ 364 | 365 | Each SPMD run is implicitly executed in a different context. This allows for multiple `spmd` calls to 366 | be active at the same time. An SPMD context can be explicitly specified via the keyword argument `context` to `spmd`. 367 | 368 | `context(pids=procs())` returns a new SPMD context. 369 | 370 | An SPMD context also provides context local storage, a dict, which can be used to store 371 | key-value pairs between spmd runs under the same context. 372 | 373 | `context_local_storage()` returns the dictionary associated with the context. 374 | 375 | NOTE: Implicitly defined contexts, i.e., `spmd` calls without specifying a `context`, create a context 376 | which lives only for the duration of the call. Explicitly created context objects can be released 377 | early by calling `close(ctxt::SPMDContext)`. This will release the local storage dictionaries 378 | on all participating `pids`. Otherwise they will be released when the context object is gc'ed 379 | on the node that created it. 380 | 381 | 382 | Nested `spmd` calls 383 | ------------------- 384 | As `spmd` executes the specified function on all participating nodes, we need to be careful with nesting `spmd` calls. 385 | 386 | An example of an unsafe (wrong) way: 387 | ``` 388 | function foo(.....) 389 | ...... 390 | spmd(bar, ......) 391 | ...... 392 | end 393 | 394 | function bar(....) 395 | ...... 396 | spmd(baz, ......) 397 | ...... 398 | end 399 | 400 | spmd(foo,....) 401 | ``` 402 | In the above example, `foo`, `bar` and `baz` are all functions wishing to leverage distributed computation. However, they themselves may currently be part of an `spmd` call. A safe way to handle such a scenario is to only drive parallel computation from the master process. 403 | 404 | The correct way (only have the driver process initiate `spmd` calls): 405 | ``` 406 | function foo() 407 | ...... 408 | myid()==1 && spmd(bar, ......) 409 | ...... 410 | end 411 | 412 | function bar() 413 | ...... 414 | myid()==1 && spmd(baz, ......) 415 | ...... 416 | end 417 | 418 | spmd(foo,....) 419 | ``` 420 | 421 | This is also true of functions which automatically distribute computation on DArrays. 422 | ``` 423 | function foo(d::DArray) 424 | ...... 425 | myid()==1 && map!(bar, d) 426 | ...... 427 | end 428 | spmd(foo,....) 429 | ``` 430 | Without the `myid()` check, the `spmd` call to `foo` would execute `map!` from all nodes, which is probably not what we want. 431 | 432 | Similarly, `@everywhere` from within an SPMD run should also be driven from the master node only. 433 | -------------------------------------------------------------------------------- /src/darray.jl: -------------------------------------------------------------------------------- 1 | """ 2 | DArray(init, dims, [procs, dist]) 3 | 4 | Construct a distributed array. 5 | 6 | The parameter `init` is a function that accepts a tuple of index ranges.
7 | This function should allocate a local chunk of the distributed array and initialize it for the specified indices. 8 | 9 | `dims` is the overall size of the distributed array. 10 | 11 | `procs` optionally specifies a vector of process IDs to use. 12 | If unspecified, the array is distributed over all worker processes only. Typically, when running in distributed mode, 13 | i.e., nprocs() > 1, this would mean that no chunk of the distributed array exists on the process hosting the 14 | interactive julia prompt. 15 | 16 | `dist` is an integer vector specifying how many chunks the distributed array should be divided into in each dimension. 17 | 18 | For example, the `dfill` function that creates a distributed array and fills it with a value `v` is implemented as: 19 | 20 | ### Example 21 | ```jl 22 | dfill(v, args...) = DArray(I->fill(v, map(length,I)), args...) 23 | ``` 24 | """ 25 | type DArray{T,N,A} <: AbstractArray{T,N} 26 | id::Tuple 27 | dims::NTuple{N,Int} 28 | pids::Array{Int,N} # pids[i]==p ⇒ processor p has piece i 29 | indexes::Array{NTuple{N,UnitRange{Int}},N} # indexes held by piece i 30 | cuts::Vector{Vector{Int}} # cuts[d][i] = first index of chunk i in dimension d 31 | localpart::Nullable{A} 32 | 33 | release::Bool 34 | 35 | function DArray(id, dims, pids, indexes, cuts, lp) 36 | # check invariants 37 | if dims != map(last, last(indexes)) 38 | throw(ArgumentError("dimension of DArray (dim) and indexes do not match")) 39 | end 40 | release = (myid() == id[1]) 41 | 42 | haskey(registry, id) && return registry[id] 43 | 44 | d = new(id, dims, pids, indexes, cuts, lp, release) 45 | if release 46 | push!(refs, id) 47 | registry[id] = d 48 | 49 | # println("Installing finalizer for : ", d.id, ", : ", object_id(d), ", isbits: ", isbits(d)) 50 | finalizer(d, close) 51 | end 52 | d 53 | end 54 | 55 | DArray() = new() 56 | end 57 | 58 | eltype{T}(::Type{DArray{T}}) = T 59 | empty_localpart(T,N,A) = convert(A, Array{T}(ntuple(zero, N))) 60 | 61 | typealias SubDArray{T,N,D<:DArray} SubArray{T,N,D} 62 | typealias SubOrDArray{T,N} Union{DArray{T,N}, SubDArray{T,N}} 63 | 64 | localtype{T,N,S}(::Type{DArray{T,N,S}}) = S 65 | localtype{T,N,D}(::Type{SubDArray{T,N,D}}) = localtype(D) 66 | localtype(A::SubOrDArray) = localtype(typeof(A)) 67 | localtype(A::AbstractArray) = typeof(A) 68 | 69 | ## core constructors ## 70 | 71 | function DArray(id, init, dims, pids, idxs, cuts) 72 | r=Channel(1) 73 | @sync begin 74 | for i = 1:length(pids) 75 | @async begin 76 | local typA 77 | if isa(init, Function) 78 | typA=remotecall_fetch(construct_localparts, pids[i], init, id, dims, pids, idxs, cuts) 79 | else 80 | # constructing from an array of remote refs. 81 | typA=remotecall_fetch(construct_localparts, pids[i], init[i], id, dims, pids, idxs, cuts) 82 | end 83 | !isready(r) && put!(r, typA) 84 | end 85 | end 86 | end 87 | 88 | A = take!(r) 89 | if myid() in pids 90 | d = registry[id] 91 | else 92 | T = eltype(A) 93 | N = length(dims) 94 | d = DArray{T,N,A}(id, dims, pids, idxs, cuts, empty_localpart(T,N,A)) 95 | end 96 | d 97 | end 98 | 99 | function construct_localparts(init, id, dims, pids, idxs, cuts; T=nothing, A=nothing) 100 | localpart = isa(init, Function) ? 
init(idxs[localpartindex(pids)]) : fetch(init) 101 | if A == nothing 102 | A = typeof(localpart) 103 | end 104 | if T == nothing 105 | T = eltype(A) 106 | end 107 | N = length(dims) 108 | d = DArray{T,N,A}(id, dims, pids, idxs, cuts, localpart) 109 | registry[id] = d 110 | A 111 | end 112 | 113 | function ddata(;T::Type=Any, init::Function=I->nothing, pids=workers(), data::Vector=[]) 114 | pids=sort(vec(pids)) 115 | id = next_did() 116 | npids = length(pids) 117 | ldata = length(data) 118 | idxs, cuts = chunk_idxs([npids], [npids]) 119 | 120 | if ldata > 0 121 | @assert rem(ldata,npids) == 0 122 | if ldata == npids 123 | T = eltype(data) 124 | s = DestinationSerializer(pididx->data[pididx], pids) 125 | init = I->localpart(s) 126 | else 127 | # call the standard distribute function 128 | return distribute(data) 129 | end 130 | end 131 | 132 | @sync for i = 1:length(pids) 133 | @async remotecall_fetch(construct_localparts, pids[i], init, id, (npids,), pids, idxs, cuts; T=T, A=T) 134 | end 135 | 136 | if myid() in pids 137 | d = registry[id] 138 | else 139 | d = DArray{T,1,T}(id, (npids,), pids, idxs, cuts, Nullable{T}()) 140 | end 141 | d 142 | end 143 | 144 | function gather{T}(d::DArray{T,1,T}) 145 | a=Array{T}(length(procs(d))) 146 | @sync for (i,p) in enumerate(procs(d)) 147 | @async a[i] = remotecall_fetch(localpart, p, d) 148 | end 149 | a 150 | end 151 | 152 | function DArray(init, dims, procs, dist) 153 | np = prod(dist) 154 | procs = reshape(procs[1:np], ntuple(i->dist[i], length(dist))) 155 | idxs, cuts = chunk_idxs([dims...], dist) 156 | id = next_did() 157 | 158 | return DArray(id, init, dims, procs, idxs, cuts) 159 | end 160 | 161 | function DArray(init, dims, procs) 162 | if isempty(procs) 163 | throw(ArgumentError("no processors given")) 164 | end 165 | return DArray(init, dims, procs, defaultdist(dims, procs)) 166 | end 167 | DArray(init, dims) = DArray(init, dims, workers()[1:min(nworkers(), maximum(dims))]) 168 | 169 | # Create a DArray from a collection of references 170 | # The refs must have the same layout as the parts distributed. 171 | # i.e. 172 | # size(refs) must specify the distribution of dimensions across processors 173 | # prod(size(refs)) must equal number of parts 174 | # FIXME : Empty parts are currently not supported. 175 | function DArray(refs) 176 | dimdist = size(refs) 177 | id = next_did() 178 | 179 | npids = [r.where for r in refs] 180 | nsizes = Array{Tuple}(dimdist) 181 | @sync for i in 1:length(refs) 182 | let i=i 183 | @async nsizes[i] = remotecall_fetch(sz_localpart_ref, npids[i], refs[i], id) 184 | end 185 | end 186 | 187 | nindexes = Array{NTuple{length(dimdist),UnitRange{Int}}}(dimdist...) 188 | 189 | for i in 1:length(nindexes) 190 | subidx = ind2sub(dimdist, i) 191 | nindexes[i] = ntuple(length(subidx)) do x 192 | idx_in_dim = subidx[x] 193 | startidx = 1 194 | for j in 1:(idx_in_dim-1) 195 | prevsubidx = ntuple(y -> y == x ? j : subidx[y], length(subidx)) 196 | prevsize = nsizes[prevsubidx...] 197 | startidx += prevsize[x] 198 | end 199 | startidx:startidx+(nsizes[i][x])-1 200 | end 201 | end 202 | 203 | lastidxs = hcat([Int[last(idx_in_d)+1 for idx_in_d in idx] for idx in nindexes]...) 204 | ncuts = Array{Int,1}[unshift!(sort(unique(lastidxs[x,:])), 1) for x in 1:length(dimdist)] 205 | ndims = tuple([sort(unique(lastidxs[x,:]))[end]-1 for x in 1:length(dimdist)]...) 
206 | 207 | DArray(id, refs, ndims, reshape(npids, dimdist), nindexes, ncuts) 208 | end 209 | 210 | macro DArray(ex0::Expr) 211 | if ex0.head !== :comprehension 212 | throw(ArgumentError("invalid @DArray syntax")) 213 | end 214 | ex = ex0.args[1] 215 | if ex.head !== :generator 216 | throw(ArgumentError("invalid @DArray syntax")) 217 | end 218 | ex.args[1] = esc(ex.args[1]) 219 | ndim = length(ex.args) - 1 220 | ranges = map(r->esc(r.args[2]), ex.args[2:end]) 221 | for d = 1:ndim 222 | var = ex.args[d+1].args[1] 223 | ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] ) 224 | end 225 | return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0), 226 | tuple($(map(r->:(length($r)), ranges)...))) ) 227 | end 228 | 229 | # new DArray similar to an existing one 230 | DArray(init, d::DArray) = DArray(next_did(), init, size(d), procs(d), d.indexes, d.cuts) 231 | 232 | sz_localpart_ref(ref, id) = size(fetch(ref)) 233 | 234 | Base.similar(d::DArray, T::Type, dims::Dims) = DArray(I->Array{T}(map(length,I)), dims, procs(d)) 235 | Base.similar(d::DArray, T::Type) = similar(d, T, size(d)) 236 | Base.similar{T}(d::DArray{T}, dims::Dims) = similar(d, T, dims) 237 | Base.similar{T}(d::DArray{T}) = similar(d, T, size(d)) 238 | 239 | Base.size(d::DArray) = d.dims 240 | 241 | chunktype{T,N,A}(d::DArray{T,N,A}) = A 242 | 243 | ## chunk index utilities ## 244 | 245 | # decide how to divide each dimension 246 | # returns size of chunks array 247 | function defaultdist(dims, pids) 248 | dims = [dims...] 249 | chunks = ones(Int, length(dims)) 250 | np = length(pids) 251 | f = sort!(collect(keys(factor(np))), rev=true) 252 | k = 1 253 | while np > 1 254 | # repeatedly allocate largest factor to largest dim 255 | if np % f[k] != 0 256 | k += 1 257 | if k > length(f) 258 | break 259 | end 260 | end 261 | fac = f[k] 262 | (d, dno) = findmax(dims) 263 | # resolve ties to highest dim 264 | dno = last(find(dims .== d)) 265 | if dims[dno] >= fac 266 | dims[dno] = div(dims[dno], fac) 267 | chunks[dno] *= fac 268 | end 269 | np = div(np, fac) 270 | end 271 | return chunks 272 | end 273 | 274 | # get array of start indexes for dividing sz into nc chunks 275 | function defaultdist(sz::Int, nc::Int) 276 | if sz >= nc 277 | return round.(Int, linspace(1, sz+1, nc+1)) 278 | else 279 | return [[1:(sz+1);], zeros(Int, nc-sz);] 280 | end 281 | end 282 | 283 | # compute indexes array for dividing dims into chunks 284 | function chunk_idxs(dims, chunks) 285 | cuts = map(defaultdist, dims, chunks) 286 | n = length(dims) 287 | idxs = Array{NTuple{n,UnitRange{Int}}}(chunks...) 288 | for cidx in CartesianRange(tuple(chunks...)) 289 | idxs[cidx.I...] = ntuple(i -> (cuts[i][cidx[i]]:cuts[i][cidx[i] + 1] - 1), n) 290 | end 291 | return (idxs, cuts) 292 | end 293 | 294 | function localpartindex(pids::Array{Int}) 295 | mi = myid() 296 | for i = 1:length(pids) 297 | if pids[i] == mi 298 | return i 299 | end 300 | end 301 | return 0 302 | end 303 | localpartindex(d::DArray) = localpartindex(procs(d)) 304 | 305 | """ 306 | localpart(d::DArray) 307 | 308 | Get the local piece of a distributed array. 309 | Returns an empty array if no local part exists on the calling process. 310 | 311 | d[:L], d[:l], d[:LP], d[:lp] are an alternative means to get localparts. 312 | This syntaxt can also be used for assignment. For example, 313 | `d[:L]=v` will assign `v` to the localpart of `d`. 
314 | """ 315 | function localpart{T,N,A}(d::DArray{T,N,A}) 316 | lpidx = localpartindex(d) 317 | if lpidx == 0 318 | return empty_localpart(T,N,A)::A 319 | end 320 | 321 | return get(registry[d.id].localpart)::A 322 | end 323 | 324 | localpart(d::DArray, localidx...) = localpart(d)[localidx...] 325 | 326 | # shortcut to set/get localparts of a distributed object 327 | function Base.getindex(d::DArray, s::Symbol) 328 | @assert s in [:L, :l, :LP, :lp] 329 | return localpart(d) 330 | end 331 | 332 | function Base.setindex!{T,N,A}(d::DArray{T,N,A}, new_lp::A, s::Symbol) 333 | @assert s in [:L, :l, :LP, :lp] 334 | d.localpart = new_lp 335 | new_lp 336 | end 337 | 338 | 339 | # fetch localpart of d at pids[i] 340 | fetch{T,N,A}(d::DArray{T,N,A}, i) = remotecall_fetch(localpart, d.pids[i], d) 341 | 342 | """ 343 | localindexes(d) 344 | 345 | A tuple describing the indexes owned by the local process. 346 | Returns a tuple with empty ranges if no local part exists on the calling process. 347 | """ 348 | function localindexes(d::DArray) 349 | lpidx = localpartindex(d) 350 | if lpidx == 0 351 | return ntuple(i -> 1:0, ndims(d)) 352 | end 353 | return d.indexes[lpidx] 354 | end 355 | 356 | # find which piece holds index (I...) 357 | locate(d::DArray, I::Int...) = 358 | ntuple(i -> searchsortedlast(d.cuts[i], I[i]), ndims(d)) 359 | 360 | chunk{T,N,A}(d::DArray{T,N,A}, i...) = remotecall_fetch(localpart, d.pids[i...], d)::A 361 | 362 | ## convenience constructors ## 363 | 364 | """ 365 | dzeros(dims, ...) 366 | 367 | Construct a distributed array of zeros. 368 | Trailing arguments are the same as those accepted by `DArray`. 369 | """ 370 | dzeros(dims::Dims, args...) = DArray(I->zeros(map(length,I)), dims, args...) 371 | dzeros{T}(::Type{T}, dims::Dims, args...) = DArray(I->zeros(T,map(length,I)), dims, args...) 372 | dzeros{T}(::Type{T}, d1::Integer, drest::Integer...) = dzeros(T, convert(Dims, tuple(d1, drest...))) 373 | dzeros(d1::Integer, drest::Integer...) = dzeros(Float64, convert(Dims, tuple(d1, drest...))) 374 | dzeros(d::Dims) = dzeros(Float64, d) 375 | 376 | 377 | """ 378 | dones(dims, ...) 379 | 380 | Construct a distributed array of ones. 381 | Trailing arguments are the same as those accepted by `DArray`. 382 | """ 383 | dones(dims::Dims, args...) = DArray(I->ones(map(length,I)), dims, args...) 384 | dones{T}(::Type{T}, dims::Dims, args...) = DArray(I->ones(T,map(length,I)), dims, args...) 385 | dones{T}(::Type{T}, d1::Integer, drest::Integer...) = dones(T, convert(Dims, tuple(d1, drest...))) 386 | dones(d1::Integer, drest::Integer...) = dones(Float64, convert(Dims, tuple(d1, drest...))) 387 | dones(d::Dims) = dones(Float64, d) 388 | 389 | """ 390 | dfill(x, dims, ...) 391 | 392 | Construct a distributed array filled with value `x`. 393 | Trailing arguments are the same as those accepted by `DArray`. 394 | """ 395 | dfill(v, dims::Dims, args...) = DArray(I->fill(v, map(length,I)), dims, args...) 396 | dfill(v, d1::Integer, drest::Integer...) = dfill(v, convert(Dims, tuple(d1, drest...))) 397 | 398 | """ 399 | drand(dims, ...) 400 | 401 | Construct a distributed uniform random array. 402 | Trailing arguments are the same as those accepted by `DArray`. 403 | """ 404 | drand(r, dims::Dims, args...) = DArray(I -> rand(r, map(length,I)), dims, args...) 405 | drand(r, d1::Integer, drest::Integer...) = drand(r, convert(Dims, tuple(d1, drest...))) 406 | drand(d1::Integer, drest::Integer...) = drand(Float64, convert(Dims, tuple(d1, drest...))) 407 | drand(d::Dims, args...) 
= drand(Float64, d, args...) 408 | 409 | """ 410 | drandn(dims, ...) 411 | 412 | Construct a distributed normal random array. 413 | Trailing arguments are the same as those accepted by `DArray`. 414 | """ 415 | drandn(dims::Dims, args...) = DArray(I->randn(map(length,I)), dims, args...) 416 | drandn(d1::Integer, drest::Integer...) = drandn(convert(Dims, tuple(d1, drest...))) 417 | 418 | ## conversions ## 419 | 420 | """ 421 | distribute(A[; procs, dist]) 422 | 423 | Convert a local array to distributed. 424 | 425 | `procs` optionally specifies an array of process IDs to use. (defaults to all workers) 426 | `dist` optionally specifies a vector or tuple of the number of partitions in each dimension 427 | """ 428 | function distribute(A::AbstractArray; 429 | procs = workers()[1:min(nworkers(), maximum(size(A)))], 430 | dist = defaultdist(size(A), procs)) 431 | np = prod(dist) 432 | procs_used = procs[1:np] 433 | idxs, _ = chunk_idxs([size(A)...], dist) 434 | 435 | s = verified_destination_serializer(reshape(procs_used, size(idxs)), size(idxs)) do pididx 436 | A[idxs[pididx]...] 437 | end 438 | return DArray(I->localpart(s), size(A), procs_used, dist) 439 | end 440 | 441 | """ 442 | distribute(A, DA) 443 | 444 | Distribute a local array `A` like the distributed array `DA`. 445 | 446 | """ 447 | function distribute(A::AbstractArray, DA::DArray) 448 | size(DA) == size(A) || throw(DimensionMismatch("Distributed array has size $(size(DA)) but array has $(size(A))")) 449 | 450 | s = verified_destination_serializer(procs(DA), size(DA.indexes)) do pididx 451 | A[DA.indexes[pididx]...] 452 | end 453 | return DArray(I->localpart(s), DA) 454 | end 455 | 456 | Base.convert{T,N,S<:AbstractArray}(::Type{DArray{T,N,S}}, A::S) = distribute(convert(AbstractArray{T,N}, A)) 457 | 458 | Base.convert{S,T,N}(::Type{Array{S,N}}, d::DArray{T,N}) = begin 459 | a = Array{S}(size(d)) 460 | @sync begin 461 | for i = 1:length(d.pids) 462 | @async a[d.indexes[i]...] = chunk(d, i) 463 | end 464 | end 465 | return a 466 | end 467 | 468 | Base.convert{S,T,N}(::Type{Array{S,N}}, s::SubDArray{T,N}) = begin 469 | I = s.indexes 470 | d = s.parent 471 | if isa(I,Tuple{Vararg{UnitRange{Int}}}) && S<:T && T<:S 472 | l = locate(d, map(first, I)...) 473 | if isequal(d.indexes[l...], I) 474 | # SubDArray corresponds to a chunk 475 | return chunk(d, l...) 476 | end 477 | end 478 | a = Array{S}(size(s)) 479 | a[[1:size(a,i) for i=1:N]...] = s 480 | return a 481 | end 482 | 483 | function Base.convert{T,N}(::Type{DArray}, SD::SubArray{T,N}) 484 | D = SD.parent 485 | DArray(size(SD), procs(D)) do I 486 | TR = typeof(SD.indexes[1]) 487 | lindices = Array{TR}(0) 488 | for (i,r) in zip(I, SD.indexes) 489 | st = step(r) 490 | lrstart = first(r) + st*(first(i)-1) 491 | lrend = first(r) + st*(last(i)-1) 492 | if TR <: UnitRange 493 | push!(lindices, lrstart:lrend) 494 | else 495 | push!(lindices, lrstart:st:lrend) 496 | end 497 | end 498 | convert(Array, D[lindices...]) 499 | end 500 | end 501 | 502 | Base.reshape{T,S<:Array}(A::DArray{T,1,S}, d::Dims) = begin 503 | if prod(d) != length(A) 504 | throw(DimensionMismatch("dimensions must be consistent with array size")) 505 | end 506 | return DArray(d) do I 507 | sz = map(length,I) 508 | d1offs = first(I[1]) 509 | nd = length(I) 510 | 511 | B = Array{T}(sz) 512 | nr = size(B,1) 513 | sztail = size(B)[2:end] 514 | 515 | for i=1:div(length(B),nr) 516 | i2 = ind2sub(sztail, i) 517 | globalidx = [ I[j][i2[j-1]] for j=2:nd ] 518 | 519 | a = sub2ind(d, d1offs, globalidx...) 
520 | 521 | B[:,i] = A[a:(a+nr-1)] 522 | end 523 | B 524 | end 525 | end 526 | 527 | ## indexing ## 528 | 529 | getlocalindex(d::DArray, idx...) = localpart(d)[idx...] 530 | function getindex_tuple{T}(d::DArray{T}, I::Tuple{Vararg{Int}}) 531 | chidx = locate(d, I...) 532 | idxs = d.indexes[chidx...] 533 | localidx = ntuple(i -> (I[i] - first(idxs[i]) + 1), ndims(d)) 534 | pid = d.pids[chidx...] 535 | return remotecall_fetch(getlocalindex, pid, d, localidx...)::T 536 | end 537 | 538 | Base.getindex(d::DArray, i::Int) = getindex_tuple(d, ind2sub(size(d), i)) 539 | Base.getindex(d::DArray, i::Int...) = getindex_tuple(d, i) 540 | 541 | Base.getindex(d::DArray) = d[1] 542 | Base.getindex(d::DArray, I::Union{Int,UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) = view(d, I...) 543 | 544 | 545 | Base.copy!(dest::SubOrDArray, src::SubOrDArray) = begin 546 | asyncmap(procs(dest)) do p 547 | remotecall_fetch(p) do 548 | localpart(dest)[:] = src[localindexes(dest)...] 549 | end 550 | end 551 | return dest 552 | end 553 | 554 | # local copies are obtained by convert(Array, ) or assigning from 555 | # a SubDArray to a local Array. 556 | 557 | function Base.setindex!(a::Array, d::DArray, 558 | I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) 559 | n = length(I) 560 | @sync for i = 1:length(d.pids) 561 | K = d.indexes[i] 562 | @async a[[I[j][K[j]] for j=1:n]...] = chunk(d, i) 563 | end 564 | return a 565 | end 566 | 567 | # We also want to optimize setindex! with a SubDArray source, but this is hard 568 | # and only works on 0.5. 569 | 570 | # Similar to Base.indexin, but just create a logical mask. Note that this 571 | # must return a logical mask in order to support merging multiple masks 572 | # together into one linear index since we need to know how many elements to 573 | # skip at the end. In many cases range intersection would be much faster 574 | # than generating a logical mask, but that loses the endpoint information. 575 | indexin_mask(a, b::Number) = a .== b 576 | indexin_mask(a, r::Range{Int}) = [i in r for i in a] 577 | indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) 578 | indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) 579 | indexin_mask(a, b) = [i in b for i in a] 580 | 581 | import Base: tail 582 | # Given a tuple of indices and a tuple of masks, restrict the indices to the 583 | # valid regions. This is, effectively, reversing Base.setindex_shape_check. 584 | # We can't just use indexing into MergedIndices here because getindex is much 585 | # pickier about singleton dimensions than setindex! is. 586 | restrict_indices(::Tuple{}, ::Tuple{}) = () 587 | function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) 588 | if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) 589 | (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) 590 | elseif length(a[1]) == 1 591 | (a[1], restrict_indices(tail(a), b)) 592 | elseif length(b[1]) == 1 && b[1][1] 593 | restrict_indices(a, tail(b)) 594 | else 595 | throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) 596 | end 597 | end 598 | # The final indices are funky - they're allowed to accumulate together. 599 | # An easy (albeit very inefficient) fix for too many masks is to use the 600 | # outer product to merge them. 
But we can do that lazily with a custom type: 601 | function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) 602 | (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) 603 | end 604 | # But too many indices is much harder; this requires merging the indices 605 | # in `a` before applying the final mask in `b`. 606 | function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) 607 | if length(a[1]) == 1 608 | (a[1], restrict_indices(tail(a), b)) 609 | else 610 | # When one mask spans multiple indices, we need to merge the indices 611 | # together. At this point, we can just use indexing to merge them since 612 | # there's no longer special handling of singleton dimensions 613 | (view(MergedIndices(a, map(length, a)), b[1]),) 614 | end 615 | end 616 | 617 | immutable ProductIndices{I,N} <: AbstractArray{Bool, N} 618 | indices::I 619 | sz::NTuple{N,Int} 620 | end 621 | Base.size(P::ProductIndices) = P.sz 622 | # This gets passed to map to avoid breaking propagation of inbounds 623 | Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] 624 | Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = 625 | Bool((&)(map(propagate_getindex, P.indices, I)...)) 626 | 627 | immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} 628 | indices::I 629 | sz::NTuple{N,Int} 630 | end 631 | Base.size(M::MergedIndices) = M.sz 632 | Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = 633 | CartesianIndex(map(propagate_getindex, M.indices, I)) 634 | # Additionally, we optimize bounds checking when using MergedIndices as an 635 | # array index since checking, e.g., A[1:500, 1:500] is *way* faster than 636 | # checking an array of 500^2 elements of CartesianIndex{2}. This optimization 637 | # also applies to reshapes of MergedIndices since the outer shape of the 638 | # container doesn't affect the index elements themselves. We can go even 639 | # farther and say that even restricted views of MergedIndices must be valid 640 | # over the entire array. This is overly strict in general, but in this 641 | # use-case all the merged indices must be valid at some point, so it's ok. 642 | typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} 643 | typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} 644 | typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} 645 | import Base: checkbounds_indices 646 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 647 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 648 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 649 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 650 | @inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = 651 | checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) 652 | 653 | # The tricky thing here is that we want to optimize the accesses into the 654 | # distributed array, but in doing so, we lose track of which indices in I we 655 | # should be using. 656 | # 657 | # I’ve come to the conclusion that the function is utterly insane. 658 | # There are *6* flavors of indices with four different reference points: 659 | # 1. Find the indices of each portion of the DArray. 660 | # 2. 
Find the valid subset of indices for the SubArray into that portion. 661 | # 3. Find the portion of the `I` indices that should be used when you access the 662 | # `K` indices in the subarray. This guy is nasty. It’s totally backwards 663 | # from all other arrays, wherein we simply iterate over the source array’s 664 | # elements. You need to *both* know which elements in `J` were skipped 665 | # (`indexin_mask`) and which dimensions should match up (`restrict_indices`) 666 | # 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of 667 | # the local portion of the source array 668 | function Base.setindex!(a::Array, s::SubDArray, 669 | I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) 670 | Inew = Base.to_indices(a, I) 671 | Base.setindex_shape_check(s, Base.index_lengths(Inew...)...) 672 | n = length(Inew) 673 | d = s.parent 674 | J = Base.to_indices(d, s.indexes) 675 | @sync for i = 1:length(d.pids) 676 | K_c = d.indexes[i] 677 | K = map(intersect, J, K_c) 678 | if !any(isempty, K) 679 | K_mask = map(indexin_mask, J, K_c) 680 | idxs = restrict_indices(Inew, K_mask) 681 | if isequal(K, K_c) 682 | # whole chunk 683 | @async a[idxs...] = chunk(d, i) 684 | else 685 | # partial chunk 686 | @async a[idxs...] = 687 | remotecall_fetch(d.pids[i]) do 688 | view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) 689 | end 690 | end 691 | end 692 | end 693 | return a 694 | end 695 | 696 | Base.fill!(A::DArray, x) = begin 697 | @sync for p in procs(A) 698 | @async remotecall_fetch((A,x)->(fill!(localpart(A), x); nothing), p, A, x) 699 | end 700 | return A 701 | end 702 | -------------------------------------------------------------------------------- /test/darray.jl: -------------------------------------------------------------------------------- 1 | t=@testset "test distribute" begin 2 | A = rand(1:100, (100,100)) 3 | 4 | @testset "test default distribute" begin 5 | DA = distribute(A) 6 | @test length(procs(DA)) == nworkers() 7 | @test sum(DA) == sum(A) 8 | close(DA) 9 | end 10 | 11 | @testset "test distribute with procs arguments" begin 12 | DA = distribute(A, procs = procs()) 13 | @test length(procs(DA)) == nprocs() 14 | @test sum(DA) == sum(A) 15 | close(DA) 16 | end 17 | 18 | @testset "test distribute with procs and dist arguments" begin 19 | DA = distribute(A, procs = [1, 2], dist = [1,2]) 20 | @test size(procs(DA)) == (1,2) 21 | @test sum(DA) == sum(A) 22 | close(DA) 23 | end 24 | 25 | @testset "Create darray with unconventional distribution and distibute like it" begin 26 | block = 10 27 | Y = nworkers() * block 28 | X = nworkers() * block 29 | remote_parts = map(workers()) do wid 30 | remotecall(rand, wid, block, Y) 31 | end 32 | DA1 = DArray(reshape(remote_parts, (length(remote_parts), 1))) 33 | A = rand(X, Y) 34 | DA2 = distribute(A, DA1) 35 | 36 | @test size(DA1) == size(DA2) 37 | 38 | close(DA1) 39 | close(DA2) 40 | end 41 | end 42 | 43 | check_leaks(t) 44 | 45 | t=@testset "test DArray equality" begin 46 | D = drand((200,200), [MYID, OTHERIDS]) 47 | DC = copy(D) 48 | 49 | @testset "test isequal(::DArray, ::DArray)" begin 50 | @test D == DC 51 | end 52 | 53 | @testset "test copy(::DArray) does a copy of each localpart" begin 54 | @spawnat OTHERIDS localpart(DC)[1] = 0 55 | @test fetch(@spawnat OTHERIDS localpart(D)[1] != 0) 56 | end 57 | 58 | close(D) 59 | close(DC) 60 | end 61 | 62 | check_leaks(t) 63 | 64 | t=@testset "test DArray similar" begin 65 | D = drand((200,200), [MYID, OTHERIDS]) 66 | DS = similar(D,Float16) 67 | 68 | @testset 
"test eltype of a similar" begin 69 | @test eltype(DS) == Float16 70 | end 71 | 72 | @testset "test dims of a similar" begin 73 | @test size(D) == size(DS) 74 | end 75 | close(D) 76 | close(DS) 77 | end 78 | 79 | check_leaks(t) 80 | 81 | t=@testset "test DArray reshape" begin 82 | D = drand((200,200), [MYID, OTHERIDS]) 83 | 84 | @testset "Test error-throwing in reshape" begin 85 | @test_throws DimensionMismatch reshape(D,(100,100)) 86 | end 87 | 88 | DR = reshape(D,(100,400)) 89 | @testset "Test reshape" begin 90 | @test size(DR) == (100,400) 91 | end 92 | close(D) 93 | end 94 | 95 | check_leaks(t) 96 | 97 | t=@testset "test @DArray comprehension constructor" begin 98 | 99 | @testset "test valid use of @DArray" begin 100 | D = @DArray [i+j for i=1:10, j=1:10] 101 | @test D == [i+j for i=1:10, j=1:10] 102 | close(D) 103 | end 104 | 105 | @testset "test invalid use of @DArray" begin 106 | @test_throws ArgumentError eval(:((@DArray [1,2,3,4]))) 107 | end 108 | end 109 | 110 | check_leaks(t) 111 | 112 | t=@testset "test DArray / Array conversion" begin 113 | D = drand((200,200), [MYID, OTHERIDS]) 114 | 115 | @testset "test convert(::Array, ::(Sub)DArray)" begin 116 | S = convert(Matrix{Float64}, D[1:150, 1:150]) 117 | A = convert(Matrix{Float64}, D) 118 | 119 | @test A[1:150,1:150] == S 120 | D2 = convert(DArray{Float64,2,Matrix{Float64}}, A) 121 | @test D2 == D 122 | @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1] 123 | @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101] 124 | close(D2) 125 | 126 | S2 = convert(Vector{Float64}, D[4, 23:176]) 127 | @test A[4, 23:176] == S2 128 | 129 | S3 = convert(Vector{Float64}, D[23:176, 197]) 130 | @test A[23:176, 197] == S3 131 | 132 | S4 = zeros(4) 133 | setindex!(S4, D[3:4, 99:100], :) 134 | @test S4 == vec(D[3:4, 99:100]) 135 | @test S4 == vec(A[3:4, 99:100]) 136 | 137 | S5 = zeros(2,2) 138 | setindex!(S5, D[1,1:4], :, 1:2) 139 | @test vec(S5) == D[1, 1:4] 140 | @test vec(S5) == A[1, 1:4] 141 | end 142 | close(D) 143 | end 144 | 145 | check_leaks(t) 146 | 147 | t=@testset "copy!" begin 148 | D1 = dzeros((10,10)) 149 | r1 = remotecall_wait(() -> randn(3,10), workers()[1]) 150 | r2 = remotecall_wait(() -> randn(7,10), workers()[2]) 151 | D2 = DArray(reshape([r1; r2], 2, 1)) 152 | copy!(D2, D1) 153 | @test D1 == D2 154 | close(D1) 155 | close(D2) 156 | end 157 | 158 | check_leaks(t) 159 | 160 | t=@testset "test DArray reduce" begin 161 | D = DArray(id->fill(myid(), map(length,id)), (10,10), [MYID, OTHERIDS]) 162 | 163 | @testset "test reduce" begin 164 | @test reduce(+, D) == ((50*MYID) + (50*OTHERIDS)) 165 | end 166 | 167 | @testset "test map / reduce" begin 168 | D2 = map(x->1, D) 169 | @test reduce(+, D2) == 100 170 | close(D2) 171 | end 172 | 173 | @testset "test map! 
/ reduce" begin 174 | map!(x->1, D, D) 175 | @test reduce(+, D) == 100 176 | end 177 | close(D) 178 | end 179 | 180 | check_leaks(t) 181 | 182 | t=@testset "test scale" begin 183 | A = randn(100,100) 184 | DA = distribute(A) 185 | @test scale!(DA, 2) == scale!(A, 2) 186 | close(DA) 187 | end 188 | 189 | check_leaks(t) 190 | 191 | t=@testset "test scale!(b, A)" begin 192 | A = randn(100, 100) 193 | b = randn(100) 194 | DA = distribute(A) 195 | @test scale!(b, A) == scale!(b, DA) 196 | close(DA) 197 | A = randn(100, 100) 198 | b = randn(100) 199 | DA = distribute(A) 200 | @test scale!(A, b) == scale!(DA, b) 201 | close(DA) 202 | end 203 | 204 | check_leaks(t) 205 | 206 | t=@testset "test mapreduce on DArrays" begin 207 | for _ = 1:25, f = [x -> Int128(2x), x -> Int128(x^2), x -> Int128(x^2 + 2x - 1)], opt = [+, *] 208 | A = rand(1:5, rand(2:30)) 209 | DA = distribute(A) 210 | @test mapreduce(f, opt, DA) - mapreduce(f, opt, A) == 0 211 | close(DA) 212 | end 213 | end 214 | 215 | check_leaks(t) 216 | 217 | t=@testset "test mapreducedim on DArrays" begin 218 | D = DArray(I->fill(myid(), map(length,I)), (73,73), [MYID, OTHERIDS]) 219 | D2 = map(x->1, D) 220 | @test mapreducedim(t -> t*t, +, D2, 1) == mapreducedim(t -> t*t, +, convert(Array, D2), 1) 221 | @test mapreducedim(t -> t*t, +, D2, 2) == mapreducedim(t -> t*t, +, convert(Array, D2), 2) 222 | @test mapreducedim(t -> t*t, +, D2, (1,2)) == mapreducedim(t -> t*t, +, convert(Array, D2), (1,2)) 223 | 224 | # Test non-regularly chunked DArrays 225 | r1 = DistributedArrays.remotecall(() -> sprandn(3, 10, 0.1), workers()[1]) 226 | r2 = DistributedArrays.remotecall(() -> sprandn(7, 10, 0.1), workers()[2]) 227 | D = DArray(reshape([r1; r2], (2,1))) 228 | @test Array(sum(D, 2)) == sum(Array(D), 2) 229 | 230 | # close(D) 231 | # close(D2) 232 | d_closeall() # temp created by the mapreduce above 233 | end 234 | 235 | check_leaks(t) 236 | 237 | t=@testset "test mapreducdim, reducedim on DArrays" begin 238 | dims = (20,20,20) 239 | DA = drandn(dims) 240 | A = convert(Array, DA) 241 | 242 | @testset "dimension $dms" for dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3)) 243 | @test mapreducedim(t -> t*t, +, A, dms) ≈ mapreducedim(t -> t*t, +, DA, dms) 244 | @test mapreducedim(t -> t*t, +, A, dms, 1.0) ≈ mapreducedim(t -> t*t, +, DA, dms, 1.0) 245 | @test reducedim(*, A, dms) ≈ reducedim(*, DA, dms) 246 | @test reducedim(*, A, dms, 2.0) ≈ reducedim(*, DA, dms, 2.0) 247 | end 248 | close(DA) 249 | d_closeall() # temp created by the mapreduce above 250 | end 251 | 252 | check_leaks(t) 253 | 254 | t=@testset "test statistical functions on DArrays" begin 255 | dims = (20,20,20) 256 | DA = drandn(dims) 257 | A = convert(Array, DA) 258 | 259 | @testset "test $f for dimension $dms" for f in (mean, ), dms in (1, 2, 3, (1,2), (1,3), (2,3), (1,2,3)) 260 | # std is pending implementation 261 | @test f(DA,dms) ≈ f(A,dms) 262 | end 263 | 264 | close(DA) 265 | d_closeall() # temporaries created above 266 | end 267 | 268 | check_leaks(t) 269 | 270 | t=@testset "test sum on DArrays" begin 271 | A = randn(100,100) 272 | DA = distribute(A) 273 | 274 | # sum either throws an ArgumentError or a CompositeException of ArgumentErrors 275 | try 276 | sum(DA, -1) 277 | catch err 278 | if isa(err, CompositeException) 279 | @test !isempty(err.exceptions) 280 | for excep in err.exceptions 281 | # Unpack the remote exception 282 | orig_err = excep.ex.captured.ex 283 | @test isa(orig_err, ArgumentError) 284 | end 285 | else 286 | @test isa(err, ArgumentError) 287 | end 288 | end 289 | try 
290 | sum(DA, 0) 291 | catch err 292 | if isa(err, CompositeException) 293 | @test !isempty(err.exceptions) 294 | for excep in err.exceptions 295 | # Unpack the remote exception 296 | orig_err = excep.ex.captured.ex 297 | @test isa(orig_err, ArgumentError) 298 | end 299 | else 300 | @test isa(err, ArgumentError) 301 | end 302 | end 303 | 304 | @test sum(DA) ≈ sum(A) 305 | @test sum(DA,1) ≈ sum(A,1) 306 | @test sum(DA,2) ≈ sum(A,2) 307 | @test sum(DA,3) ≈ sum(A,3) 308 | close(DA) 309 | d_closeall() # temporaries created above 310 | end 311 | 312 | check_leaks(t) 313 | 314 | t=@testset "test size on DArrays" begin 315 | 316 | A = randn(100,100) 317 | DA = distribute(A) 318 | 319 | @test_throws BoundsError size(DA, 0) 320 | @test size(DA,1) == size(A,1) 321 | @test size(DA,2) == size(A,2) 322 | @test size(DA,3) == size(A,3) 323 | close(DA) 324 | end 325 | 326 | check_leaks(t) 327 | 328 | # test length / endof 329 | t=@testset "test collections API" begin 330 | A = randn(23,23) 331 | DA = distribute(A) 332 | 333 | @testset "test length" begin 334 | @test length(DA) == length(A) 335 | end 336 | 337 | @testset "test endof" begin 338 | @test endof(DA) == endof(A) 339 | end 340 | close(DA) 341 | end 342 | 343 | check_leaks(t) 344 | 345 | t=@testset "test max / min / sum" begin 346 | a = map(x -> Int(round(rand() * 100)) - 50, Array{Int}(100,1000)) 347 | d = distribute(a) 348 | 349 | @test sum(d) == sum(a) 350 | @test maximum(d) == maximum(a) 351 | @test minimum(d) == minimum(a) 352 | @test maximum(abs, d) == maximum(abs, a) 353 | @test minimum(abs, d) == minimum(abs, a) 354 | @test sum(abs, d) == sum(abs, a) 355 | @test sum(abs2, d) == sum(abs2, a) 356 | close(d) 357 | end 358 | 359 | check_leaks(t) 360 | 361 | t=@testset "test all / any" begin 362 | a = map(x->Int(round(rand() * 100)) - 50, Array{Int}(100,1000)) 363 | a = [true for i in 1:100] 364 | d = distribute(a) 365 | 366 | @test all(d) 367 | @test any(d) 368 | 369 | close(d) 370 | 371 | a[50] = false 372 | d = distribute(a) 373 | @test !all(d) 374 | @test any(d) 375 | 376 | close(d) 377 | 378 | a = [false for i in 1:100] 379 | d = distribute(a) 380 | @test !all(d) 381 | @test !any(d) 382 | 383 | close(d) 384 | 385 | d = dones(10,10) 386 | @test !all(x-> x>1.0, d) 387 | @test all(x-> x>0.0, d) 388 | 389 | close(d) 390 | 391 | a = ones(10,10) 392 | a[10] = 2.0 393 | d = distribute(a) 394 | @test any(x-> x == 1.0, d) 395 | @test any(x-> x == 2.0, d) 396 | @test !any(x-> x == 3.0, d) 397 | 398 | close(d) 399 | end 400 | 401 | check_leaks(t) 402 | 403 | t=@testset "test count" begin 404 | a = ones(10,10) 405 | a[10] = 2.0 406 | d = distribute(a) 407 | 408 | @test count(x-> x == 2.0, d) == 1 409 | @test count(x-> x == 1.0, d) == 99 410 | @test count(x-> x == 0.0, d) == 0 411 | 412 | close(d) 413 | end 414 | 415 | check_leaks(t) 416 | 417 | t=@testset "test prod" begin 418 | a = fill(2, 10); 419 | d = distribute(a); 420 | @test prod(d) == 2^10 421 | 422 | close(d) 423 | end 424 | 425 | check_leaks(t) 426 | 427 | t=@testset "test zeros" begin 428 | @testset "1D dzeros default element type" begin 429 | A = dzeros(10) 430 | @test A == zeros(10) 431 | @test eltype(A) == Float64 432 | @test size(A) == (10,) 433 | close(A) 434 | end 435 | 436 | @testset "1D dzeros with specified element type" begin 437 | A = dzeros(Int, 10) 438 | @test A == zeros(10) 439 | @test eltype(A) == Int 440 | @test size(A) == (10,) 441 | close(A) 442 | end 443 | 444 | @testset "2D dzeros default element type, Dims constuctor" begin 445 | A = dzeros((10,10)) 446 | @test A == 
zeros((10,10)) 447 | @test eltype(A) == Float64 448 | @test size(A) == (10,10) 449 | close(A) 450 | end 451 | 452 | @testset "2D dzeros specified element type, Dims constructor" begin 453 | A = dzeros(Int, (10,10)) 454 | @test A == zeros(Int, (10,10)) 455 | @test eltype(A) == Int 456 | @test size(A) == (10,10) 457 | close(A) 458 | end 459 | 460 | @testset "2D dzeros, default element type" begin 461 | A = dzeros(10,10) 462 | @test A == zeros(10,10) 463 | @test eltype(A) == Float64 464 | @test size(A) == (10,10) 465 | close(A) 466 | end 467 | 468 | @testset "2D dzeros, specified element type" begin 469 | A = dzeros(Int, 10, 10) 470 | @test A == zeros(Int, 10, 10) 471 | @test eltype(A) == Int 472 | @test size(A) == (10,10) 473 | close(A) 474 | end 475 | end 476 | 477 | check_leaks(t) 478 | 479 | t=@testset "test dones" begin 480 | @testset "1D dones default element type" begin 481 | A = dones(10) 482 | @test A == ones(10) 483 | @test eltype(A) == Float64 484 | @test size(A) == (10,) 485 | close(A) 486 | end 487 | 488 | @testset "1D dones with specified element type" begin 489 | A = dones(Int, 10) 490 | @test eltype(A) == Int 491 | @test size(A) == (10,) 492 | close(A) 493 | end 494 | 495 | @testset "2D dones default element type, Dims constuctor" begin 496 | A = dones((10,10)) 497 | @test A == ones((10,10)) 498 | @test eltype(A) == Float64 499 | @test size(A) == (10,10) 500 | close(A) 501 | end 502 | 503 | @testset "2D dones specified element type, Dims constructor" begin 504 | A = dones(Int, (10,10)) 505 | @test A == ones(Int, (10,10)) 506 | @test eltype(A) == Int 507 | @test size(A) == (10,10) 508 | close(A) 509 | end 510 | 511 | @testset "2D dones, default element type" begin 512 | A = dones(10,10) 513 | @test A == ones(10,10) 514 | @test eltype(A) == Float64 515 | @test size(A) == (10,10) 516 | close(A) 517 | end 518 | 519 | @testset "2D dones, specified element type" begin 520 | A = dones(Int, 10, 10) 521 | @test A == ones(Int, 10, 10) 522 | @test eltype(A) == Int 523 | @test size(A) == (10,10) 524 | close(A) 525 | end 526 | end 527 | 528 | check_leaks(t) 529 | 530 | t=@testset "test drand" begin 531 | @testset "1D drand" begin 532 | A = drand(100) 533 | @test eltype(A) == Float64 534 | @test size(A) == (100,) 535 | @test all(x-> x >= 0.0 && x <= 1.0, A) 536 | close(A) 537 | end 538 | 539 | @testset "1D drand, specified element type" begin 540 | A = drand(Int, 100) 541 | @test eltype(A) == Int 542 | @test size(A) == (100,) 543 | close(A) 544 | end 545 | 546 | @testset "1D drand, UnitRange" begin 547 | A = drand(1:10, 100) 548 | @test eltype(A) == Int 549 | @test size(A) == (100,) 550 | close(A) 551 | end 552 | 553 | @testset "1D drand, Array" begin 554 | A = drand([-1,0,1], 100) 555 | @test eltype(A) == Int 556 | @test size(A) == (100,) 557 | close(A) 558 | end 559 | 560 | @testset "2D drand, Dims constructor" begin 561 | A = drand((50,50)) 562 | @test eltype(A) == Float64 563 | @test size(A) == (50,50) 564 | @test all(x-> x >= 0.0 && x <= 1.0, A) 565 | close(A) 566 | end 567 | 568 | @testset "2D drand" begin 569 | A = drand(100,100) 570 | @test eltype(A) == Float64 571 | @test size(A) == (100,100) 572 | @test all(x-> x >= 0.0 && x <= 1.0, A) 573 | close(A) 574 | end 575 | 576 | @testset "2D drand, Dims constructor, specified element type" begin 577 | A = drand(Int, (100,100)) 578 | @test eltype(A) == Int 579 | @test size(A) == (100,100) 580 | close(A) 581 | end 582 | 583 | @testset "2D drand, specified element type" begin 584 | A = drand(Int, 100, 100) 585 | @test eltype(A) == Int 586 | 
@test size(A) == (100,100) 587 | close(A) 588 | end 589 | end 590 | 591 | check_leaks(t) 592 | 593 | t=@testset "test randn" begin 594 | @testset "1D drandn" begin 595 | A = drandn(100) 596 | @test eltype(A) == Float64 597 | @test size(A) == (100,) 598 | close(A) 599 | end 600 | 601 | @testset "2D drandn, Dims constructor" begin 602 | A = drandn((50,50)) 603 | @test eltype(A) == Float64 604 | @test size(A) == (50,50) 605 | close(A) 606 | end 607 | 608 | @testset "2D drandn" begin 609 | A = drandn(100,100) 610 | @test eltype(A) == Float64 611 | @test size(A) == (100,100) 612 | close(A) 613 | end 614 | end 615 | 616 | check_leaks(t) 617 | 618 | t=@testset "test c/transpose" begin 619 | @testset "test ctranspose real" begin 620 | A = drand(Float64, 100, 200) 621 | @test A' == Array(A)' 622 | close(A) 623 | end 624 | @testset "test ctranspose complex" begin 625 | A = drand(Complex128, 200, 100) 626 | @test A' == Array(A)' 627 | close(A) 628 | end 629 | @testset "test transpose real" begin 630 | A = drand(Float64, 200, 100) 631 | @test A.' == Array(A).' 632 | close(A) 633 | end 634 | @testset "test ctranspose complex" begin 635 | A = drand(Complex128, 100, 200) 636 | @test A.' == Array(A).' 637 | close(A) 638 | end 639 | 640 | d_closeall() # close the temporaries created above 641 | end 642 | 643 | check_leaks(t) 644 | 645 | t=@testset "test convert from subdarray" begin 646 | a = drand(20, 20); 647 | 648 | s = view(a, 1:5, 5:8) 649 | @test isa(s, SubDArray) 650 | @test s == convert(DArray, s) 651 | 652 | s = view(a, 6:5, 5:8) 653 | @test isa(s, SubDArray) 654 | @test s == convert(DArray, s) 655 | close(a) 656 | d_closeall() # close the temporaries created above 657 | end 658 | 659 | check_leaks(t) 660 | 661 | t=@testset "test scalar math" begin 662 | a = drand(20, 20); 663 | b = convert(Array, a) 664 | @testset "$f" for f in (-, abs, abs2, acos, acosd, acot, 665 | acotd, acsch, angle, asech, asin, 666 | asind, asinh, atan, atand, atanh, 667 | big, cbrt, ceil, cis, complex, conj, 668 | cos, cosc, cosd, cosh, cospi, cot, 669 | cotd, coth, csc, cscd, csch, dawson, 670 | deg2rad, digamma, erf, erfc, erfcinv, 671 | erfcx, erfi, erfinv, exp, exp10, exp2, 672 | expm1, exponent, float, floor, gamma, imag, 673 | invdigamma, isfinite, isinf, isnan, lfact, 674 | lgamma, log, log10, log1p, log2, rad2deg, real, 675 | sec, secd, sech, sign, sin, sinc, sind, 676 | sinh, sinpi, sqrt, tan, tand, tanh, trigamma) 677 | @test f.(a) == f.(b) 678 | end 679 | a = a + 1 680 | b = b + 1 681 | @testset "$f" for f in (asec, asecd, acosh, acsc, acscd, acoth) 682 | @test f.(a) == f.(b) 683 | end 684 | close(a) 685 | d_closeall() # close the temporaries created above 686 | end 687 | 688 | check_leaks(t) 689 | 690 | t=@testset "test mapslices" begin 691 | A = randn(5,5,5) 692 | D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)]) 693 | @test mapslices(svdvals, D, (1,2)) ≈ mapslices(svdvals, A, (1,2)) 694 | @test mapslices(svdvals, D, (1,3)) ≈ mapslices(svdvals, A, (1,3)) 695 | @test mapslices(svdvals, D, (2,3)) ≈ mapslices(svdvals, A, (2,3)) 696 | @test mapslices(sort, D, (1,)) ≈ mapslices(sort, A, (1,)) 697 | @test mapslices(sort, D, (2,)) ≈ mapslices(sort, A, (2,)) 698 | @test mapslices(sort, D, (3,)) ≈ mapslices(sort, A, (3,)) 699 | 700 | # issue #3613 701 | B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), [1,2]) 702 | @test size(B) == (1,1,4) 703 | @test all(B.==6) 704 | 705 | # issue #5141 706 | C1 = mapslices(x-> maximum(-x), D, []) 707 | @test C1 == -D 708 | 709 | # 

t=@testset "test mapslices" begin
    A = randn(5,5,5)
    D = distribute(A, procs = workers(), dist = [1, 1, min(nworkers(), 5)])
    @test mapslices(svdvals, D, (1,2)) ≈ mapslices(svdvals, A, (1,2))
    @test mapslices(svdvals, D, (1,3)) ≈ mapslices(svdvals, A, (1,3))
    @test mapslices(svdvals, D, (2,3)) ≈ mapslices(svdvals, A, (2,3))
    @test mapslices(sort, D, (1,)) ≈ mapslices(sort, A, (1,))
    @test mapslices(sort, D, (2,)) ≈ mapslices(sort, A, (2,))
    @test mapslices(sort, D, (3,)) ≈ mapslices(sort, A, (3,))

    # issue #3613
    B = mapslices(sum, dones(Float64, (2,3,4), workers(), [1,1,min(nworkers(),4)]), [1,2])
    @test size(B) == (1,1,4)
    @test all(B.==6)

    # issue #5141
    C1 = mapslices(x-> maximum(-x), D, [])
    @test C1 == -D

    # issue #5177
    c = dones(Float64, (2,3,4,5), workers(), [1,1,1,min(nworkers(),5)])
    m1 = mapslices(x-> ones(2,3), c, [1,2])
    m2 = mapslices(x-> ones(2,4), c, [1,3])
    m3 = mapslices(x-> ones(3,4), c, [2,3])
    @test size(m1) == size(m2) == size(m3) == size(c)

    n1 = mapslices(x-> ones(6), c, [1,2])
    n2 = mapslices(x-> ones(6), c, [1,3])
    n3 = mapslices(x-> ones(6), c, [2,3])
    n1a = mapslices(x-> ones(1,6), c, [1,2])
    n2a = mapslices(x-> ones(1,6), c, [1,3])
    n3a = mapslices(x-> ones(1,6), c, [2,3])
    @test (size(n1a) == (1,6,4,5) && size(n2a) == (1,3,6,5) && size(n3a) == (2,1,6,5))
    @test (size(n1) == (6,1,4,5) && size(n2) == (6,3,1,5) && size(n3) == (2,6,1,5))
    close(D)
    close(c)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test scalar ops" begin
    a = drand(20,20)
    b = convert(Array, a)
    c = drand(20,20)
    d = convert(Array, c)

    # operators are supplied as Symbols and spliced into each test expression with @eval
    @testset "$f" for f in (:+, :-, :.+, :.-, :.*, :./, :.%)
        x = rand()
        @test @eval ($f)($a, $x) == ($f)($b, $x)
        @test @eval ($f)($x, $a) == ($f)($x, $b)
        @test @eval ($f)($a, $c) == ($f)($b, $d)
    end

    close(a)
    close(c)

    a = dones(Int, 20, 20)
    b = convert(Array, a)
    @testset "$f" for f in (:.<<, :.>>)
        @test @eval ($f)($a, 2) == ($f)($b, 2)
        @test @eval ($f)(2, $a) == ($f)(2, $b)
        @test @eval ($f)($a, $a) == ($f)($b, $b)
    end

    @testset "$f" for f in (:rem,)
        x = rand()
        @test @eval ($f).($a, $x) == ($f).($b, $x)
    end
    close(a)
    close(c)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test broadcast ops" begin
    wrkrs = workers()
    nwrkrs = length(wrkrs)
    nrows = 20 * nwrkrs
    ncols = 10 * nwrkrs
    a = drand((nrows,ncols), wrkrs, (1, nwrkrs))
    m = mean(a, 1)
    c = a .- m
    d = convert(Array, a) .- convert(Array, m)
    @test c == d
    d_closeall()
end

check_leaks(t)

t=@testset "test matrix multiplication" begin
    A = drandn(20,20)
    b = drandn(20)
    B = drandn(20,20)

    @test norm(convert(Array, A*b) - convert(Array, A)*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A*B) - convert(Array, A)*convert(Array, B), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*b) - convert(Array, A)'*convert(Array, b), Inf) < sqrt(eps())
    @test norm(convert(Array, A'*B) - convert(Array, A)'*convert(Array, B), Inf) < sqrt(eps())
    close(A)
    close(b)
    close(B)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test norm" begin
    x = drandn(20)

    @test abs(norm(x) - norm(convert(Array, x))) < sqrt(eps())
    @test abs(norm(x, 1) - norm(convert(Array, x), 1)) < sqrt(eps())
    @test abs(norm(x, 2) - norm(convert(Array, x), 2)) < sqrt(eps())
    @test abs(norm(x, Inf) - norm(convert(Array, x), Inf)) < sqrt(eps())
    close(x)
end

check_leaks(t)
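
# The distributed linear-algebra results above (and the axpy! check below) are
# compared against the same operations on gathered Arrays within a sqrt(eps())
# tolerance rather than exact equality: partial products and sums are computed
# per worker and then combined, so floating-point rounding can differ slightly
# from the serial evaluation order.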

t=@testset "test axpy!" begin
    x = drandn(20)
    y = drandn(20)

    @test norm(convert(Array, LinAlg.axpy!(2.0, x, copy(y))) - LinAlg.axpy!(2.0, convert(Array, x), convert(Array, y))) < sqrt(eps())
    @test_throws DimensionMismatch LinAlg.axpy!(2.0, x, zeros(length(x) + 1))
    close(x)
    close(y)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test ppeval" begin
    A = drandn((10, 10, nworkers()), workers(), [1, 1, nworkers()])
    B = drandn((10, nworkers()), workers(), [1, nworkers()])

    # ppeval applies the function to each worker's localpart, i.e. slice-by-slice
    # along the last (distributed) dimension here
    R = zeros(10, nworkers())
    for i = 1:nworkers()
        R[:, i] = convert(Array, A)[:, :, i]*convert(Array, B)[:, i]
    end
    @test convert(Array, ppeval(*, A, B)) ≈ R
    @test sum(ppeval(eigvals, A)) ≈ sum(ppeval(eigvals, A, eye(10, 10)))
    close(A)
    close(B)
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "test nnz" begin
    A = sprandn(10, 10, 0.5)
    @test nnz(distribute(A)) == nnz(A)
end

t=@testset "test matmatmul" begin
    A = drandn(30, 30)
    B = drandn(30, 20)
    a = convert(Array, A)
    b = convert(Array, B)

    AB = A * B
    AtB = A.' * B
    AcB = A' * B

    ab = a * b
    atb = a.' * b
    acb = a' * b

    @test AB ≈ ab
    @test AtB ≈ atb
    @test AcB ≈ acb
    d_closeall() # close the temporaries created above
end

t=@testset "sort, T = $T" for i in 0:6, T in [Int, Float64]
    d = DistributedArrays.drand(T, 10^i)
    @testset "sample = $sample" for sample in Any[true, false, (minimum(d),maximum(d)), rand(T, 10^i>512 ? 512 : 10^i)]
        d2 = DistributedArrays.sort(d; sample=sample)

        @test length(d) == length(d2)
        @test sort(convert(Array, d)) == convert(Array, d2)
    end
    d_closeall() # close the temporaries created above
end

check_leaks(t)

t=@testset "ddata" begin
    # indexing a ddata handle with :L returns the localpart stored on the calling worker
    d = ddata(;T=Int, init=I->myid())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Int, data=workers())
    for p in workers()
        @test p == remotecall_fetch(d->d[:L], p, d)
    end
    @test Int[workers()...] == gather(d)

    close(d)

    d = ddata(;T=Any, init=I->"Hello World!")
    for p in workers()
        @test "Hello World!" == remotecall_fetch(d->d[:L], p, d)
    end
    @test Any["Hello World!" for p in workers()] == gather(d)

    close(d)
end

check_leaks(t)

d_closeall()

t=@testset "test for any leaks" begin
    sleep(1.0) # allow time for any cleanup to complete
    allrefszero = Bool[remotecall_fetch(()->length(DistributedArrays.refs) == 0, p) for p in procs()]
    @test all(allrefszero)

    allregistrieszero = Bool[remotecall_fetch(()->length(DistributedArrays.registry) == 0, p) for p in procs()]
    @test all(allregistrieszero)
end

--------------------------------------------------------------------------------