├── .gitignore ├── CHANGES.md ├── CONTRIBUTING.md ├── LICENSE.md ├── Makefile ├── async_parallel.opam ├── example ├── deep.ml ├── echo_server.ml ├── jbuild ├── log.ml ├── pingpong.ml ├── ring.ml ├── simple.ml └── wide.ml ├── src ├── channel.ml ├── channel.mli ├── hub.ml ├── hub.mli ├── import.ml ├── intf.ml ├── intf.mli ├── jbuild ├── master_process.ml ├── master_process.mli ├── std.ml ├── token.ml ├── token.mli ├── worker_process.ml └── worker_process.mli └── test ├── jbuild ├── parallel_test.ml ├── qtest.ml └── test_remote.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.install 3 | *.merlin 4 | 5 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## 113.33.00 2 | 3 | - Add an option `~close_stdout_and_stderr` to `Async_parallel_deprecated.Std.Parallel.init` 4 | to close the `stdout` & `stderr` fds. 5 | 6 | This is needed when using `Async_parallel_deprecated` in a daemonized process, such as 7 | the in-development jenga server. 8 | 9 | Without this option, calling `Process.run ~prog:"jenga" ~args:["server";"start"]` from 10 | build-manager is problematic because the resulting deferred never becomes determined. 11 | 12 | ## 113.24.00 13 | 14 | - Switched to ppx. 15 | 16 | ## 112.35.00 17 | 18 | - Renamed `Async_parallel` as `Async_parallel_deprecated`; one should 19 | use `Rpc_parallel` instead. 20 | 21 | ## 112.17.00 22 | 23 | - Modernize the code 24 | 25 | ## 111.25.00 26 | 27 | - Improve error handling 28 | 29 | ## 109.41.00 30 | 31 | - Rename library from Parallel to Async_parallel 32 | 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | This repository contains open source software that is developed and 2 | maintained by [Jane Street][js]. 3 | 4 | Contributions to this project are welcome and should be submitted via 5 | GitHub pull requests. 6 | 7 | Signing contributions 8 | --------------------- 9 | 10 | We require that you sign your contributions. Your signature certifies 11 | that you wrote the patch or otherwise have the right to pass it on as 12 | an open-source patch. The rules are pretty simple: if you can certify 13 | the below (from [developercertificate.org][dco]): 14 | 15 | ``` 16 | Developer Certificate of Origin 17 | Version 1.1 18 | 19 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 20 | 1 Letterman Drive 21 | Suite D4700 22 | San Francisco, CA, 94129 23 | 24 | Everyone is permitted to copy and distribute verbatim copies of this 25 | license document, but changing it is not allowed.
26 | 27 | 28 | Developer's Certificate of Origin 1.1 29 | 30 | By making a contribution to this project, I certify that: 31 | 32 | (a) The contribution was created in whole or in part by me and I 33 | have the right to submit it under the open source license 34 | indicated in the file; or 35 | 36 | (b) The contribution is based upon previous work that, to the best 37 | of my knowledge, is covered under an appropriate open source 38 | license and I have the right under that license to submit that 39 | work with modifications, whether created in whole or in part 40 | by me, under the same open source license (unless I am 41 | permitted to submit under a different license), as indicated 42 | in the file; or 43 | 44 | (c) The contribution was provided directly to me by some other 45 | person who certified (a), (b) or (c) and I have not modified 46 | it. 47 | 48 | (d) I understand and agree that this project and the contribution 49 | are public and that a record of the contribution (including all 50 | personal information I submit with it, including my sign-off) is 51 | maintained indefinitely and may be redistributed consistent with 52 | this project or the open source license(s) involved. 53 | ``` 54 | 55 | Then you just add a line to every git commit message: 56 | 57 | ``` 58 | Signed-off-by: Joe Smith 59 | ``` 60 | 61 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 62 | 63 | If you set your `user.name` and `user.email` git configs, you can sign 64 | your commit automatically with git commit -s. 65 | 66 | [dco]: http://developercertificate.org/ 67 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2013--2018 Jane Street Group, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INSTALL_ARGS := $(if $(PREFIX),--prefix $(PREFIX),) 2 | 3 | # Default rule 4 | default: 5 | jbuilder build @install 6 | 7 | install: 8 | jbuilder install $(INSTALL_ARGS) 9 | 10 | uninstall: 11 | jbuilder uninstall $(INSTALL_ARGS) 12 | 13 | reinstall: uninstall install 14 | 15 | clean: 16 | rm -rf _build 17 | 18 | .PHONY: default install uninstall reinstall clean 19 | -------------------------------------------------------------------------------- /async_parallel.opam: -------------------------------------------------------------------------------- 1 | opam-version: "1.2" 2 | maintainer: "opensource@janestreet.com" 3 | authors: ["Jane Street Group, LLC "] 4 | homepage: "https://github.com/janestreet/async_parallel" 5 | bug-reports: "https://github.com/janestreet/async_parallel/issues" 6 | dev-repo: "git+https://github.com/janestreet/async_parallel.git" 7 | license: "MIT" 8 | build: [ 9 | ["jbuilder" "build" "-p" name "-j" jobs] 10 | ] 11 | depends: [ 12 | "async" 13 | "core" 14 | "ppx_jane" 15 | "sexplib" 16 | "jbuilder" {build & >= "1.0+beta18.1"} 17 | "ocaml-migrate-parsetree" {>= "1.0"} 18 | "ppxlib" {>= "0.1.0"} 19 | ] 20 | available: [ ocaml-version >= "4.06.1" ] 21 | descr: " 22 | Distributed computing library 23 | 24 | Parallel is a library for running tasks in other processes on a 25 | cluster of machines. 26 | " 27 | -------------------------------------------------------------------------------- /example/deep.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let deep () = 6 | Deferred.List.iter [ 1; 2; 10; 100 ] ~f:(fun depth -> 7 | let rec loop i = 8 | if i = 0 9 | then return 0 10 | else begin 11 | Parallel.run (fun () -> loop (i - 1)) 12 | >>| function 13 | | Error e -> failwith e 14 | | Ok j -> j + 1 15 | end 16 | in 17 | loop depth 18 | >>| fun d -> 19 | assert (d = depth)) 20 | ;; 21 | 22 | let () = 23 | Parallel.init (); 24 | (deep () >>> fun () -> Shutdown.shutdown 0); 25 | never_returns (Scheduler.go ()) 26 | ;; 27 | -------------------------------------------------------------------------------- /example/echo_server.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let p s = Core.Printf.printf "%s: %s\n%!" 
(Pid.to_string (Unix.getpid ())) s 6 | 7 | let master = Unix.gethostname () 8 | 9 | let echo_server s = 10 | p "echo server running"; 11 | Deferred.create (fun i -> 12 | Pipe.iter' (Hub.listen s) ~f:(fun q -> 13 | Deferred.all_unit 14 | (Queue.fold q ~init:[] ~f:(fun acc x -> 15 | match x with 16 | | `Connect _ -> 17 | p "client connected"; 18 | Deferred.unit :: acc 19 | | `Data (c, `Echo a) -> 20 | p "client sent echo request"; 21 | Hub.send s c (`Echo a); 22 | Hub.flushed s :: acc 23 | | `Data (_, `Die) -> 24 | p "client sent die request"; 25 | Ivar.fill i (); 26 | Deferred.unit :: acc 27 | | `Disconnect _ -> Deferred.unit :: acc))) 28 | >>> fun () -> Ivar.fill_if_empty i ()) 29 | ;; 30 | 31 | let main () = 32 | Parallel.spawn ~where:Parallel.random echo_server >>> fun (c, res) -> 33 | Channel.write c (`Echo "foo"); 34 | Channel.read c >>> fun (`Echo z) -> 35 | assert (z = "foo"); 36 | p "starting more clients"; 37 | Deferred.create (fun iv -> 38 | let rec loop i = 39 | if i > 9999 then Ivar.fill iv () 40 | else begin 41 | p "starting client"; 42 | Parallel.run ~where:Parallel.round_robin (fun () -> 43 | p "second client started, sending echo request with same channel!"; 44 | let id = Int.to_string i in 45 | Channel.write c (`Echo id); 46 | Channel.read c >>= fun (`Echo z) -> 47 | assert (z = id); 48 | p "creating a worker within a worker"; 49 | Parallel.run ~where:Parallel.round_robin (fun () -> 50 | p "worker from worker created"; 51 | Channel.write c (`Echo ("sub" ^ id)); 52 | p "worker from worker channel written"; 53 | Channel.read c >>= (fun (`Echo z) -> 54 | p "worker from worker read echo"; 55 | assert (z = "sub" ^ id); 56 | p "worker from worker assert succeeded"; 57 | Deferred.unit)) 58 | >>| (function 59 | | Error e -> p (sprintf "worker within worker died %s" e) 60 | | Ok () -> p "read worker from worker's result successfully")) 61 | >>> (function 62 | | Error e -> p (sprintf "client died with exception \"%s\"" e) 63 | | Ok () -> 64 | p "read client's result successfully looping"; 65 | loop (i + 1)) 66 | end 67 | in 68 | p "calling loop"; 69 | loop 0) 70 | >>> fun () -> 71 | p "second client done, writing die"; 72 | Channel.write c `Die; 73 | res >>> function 74 | | Error e -> p (sprintf "echo server died with exception %s" e) 75 | | Ok () -> p "echo server dead"; Shutdown.shutdown 0 76 | ;; 77 | 78 | let () = 79 | p "starting the world"; 80 | Exn.handle_uncaught ~exit:true (fun () -> 81 | p "calling Parallel.init"; 82 | Parallel.init ~cluster: 83 | {Cluster. 84 | master_machine = master; 85 | (* Though not exactly recommended this is an example of how flexible the 86 | new library is. *) 87 | worker_machines = []} 88 | (); 89 | p "calling main"; 90 | main (); 91 | p "calling scheduler go"; 92 | never_returns (Scheduler.go ())) 93 | ;; 94 | -------------------------------------------------------------------------------- /example/jbuild: -------------------------------------------------------------------------------- 1 | (executables 2 | ((names (echo_server ring simple pingpong wide deep log)) 3 | (libraries (async 4 | async_parallel_deprecated 5 | core)) 6 | (preprocess (pps (ppx_jane ppxlib.runner))))) 7 | 8 | 9 | (jbuild_version 1) 10 | -------------------------------------------------------------------------------- /example/log.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | module Glog = Log.Global 5 | 6 | let p s = Glog.info "%s %s: %s\n%!" 
(Unix.gethostname ()) (Pid.to_string (Unix.getpid ())) s 7 | 8 | let foo () = 9 | p "solving..."; 10 | Clock.after (sec 1.) >>| fun () -> "bar" 11 | ;; 12 | 13 | let main () = 14 | Parallel.run ~where:(`On "machine1") foo >>> function 15 | | Error e -> p (sprintf "died with exception %s" e) 16 | | Ok str -> 17 | p (sprintf "main process gets the result: %s" str); 18 | Shutdown.shutdown 0 19 | ;; 20 | 21 | let () = 22 | Exn.handle_uncaught ~exit:true (fun () -> 23 | Parallel.init ~cluster: 24 | {Cluster.master_machine = Unix.gethostname (); 25 | worker_machines = ["machine1"; "machine2"]} (); 26 | p "calling main"; 27 | main (); 28 | p "calling scheduler go"; 29 | never_returns (Scheduler.go ())) 30 | ;; 31 | -------------------------------------------------------------------------------- /example/pingpong.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let p s = Core.Printf.printf "%s: %s\n%!" (Pid.to_string (Unix.getpid ())) s 6 | 7 | let foo h = 8 | Pipe.iter_without_pushback (Hub.listen_simple h) ~f:(fun (id, `Ping) -> 9 | p "read ping"; 10 | Hub.send h id `Pong; 11 | p "sent pong") 12 | >>| fun () -> `Done 13 | ;; 14 | 15 | let main () = 16 | Parallel.spawn ~where:Parallel.random foo >>> fun (c, _res) -> 17 | let rec loop () = 18 | Channel.write c `Ping; 19 | Channel.read c >>> fun `Pong -> 20 | Clock.after (sec 1.) >>> loop 21 | in 22 | loop (); 23 | Clock.after (sec 60.) >>> fun () -> Shutdown.shutdown 0 24 | ;; 25 | 26 | let () = 27 | Exn.handle_uncaught ~exit:true (fun () -> 28 | Parallel.init ~cluster: 29 | {Cluster.master_machine = Unix.gethostname (); 30 | worker_machines = ["hkg-qws-r01"; "hkg-qws-r02"]} (); 31 | main (); 32 | never_returns (Scheduler.go ())) 33 | ;; 34 | -------------------------------------------------------------------------------- /example/ring.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let p s = Core.Printf.printf "%s: %s\n%!" (Pid.to_string (Unix.getpid ())) s 6 | 7 | let add c f = 8 | (* Run the ring nodes on either of these two machines, but never on the master. 
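Each call to [add] builds one stage of the ring: the spawned worker repeatedly reads a value from the upstream channel [c], applies [f] to it, and broadcasts the result to every client of its own hub. [Parallel.spawn] hands back a channel connected to that hub, and [add] returns it so it can act as the upstream channel of the next stage.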
*) 9 | Parallel.spawn ~where:(Parallel.random_in ["hkg-qws-r02"; "hkg-qws-r01"]) (fun h -> 10 | Pipe.iter_without_pushback (Hub.listen h) ~f:ignore 11 | >>> (fun () -> p "hub closed"); 12 | let rec loop () = 13 | Channel.read c >>> fun a -> 14 | f a >>> fun b -> 15 | Hub.send_to_all h b; 16 | Hub.flushed h 17 | >>> fun () -> 18 | loop () 19 | in 20 | loop (); 21 | Deferred.never ()) 22 | >>| fun (c, _) -> c 23 | ;; 24 | 25 | let main () = 26 | p "creating hub"; 27 | Parallel.hub () >>> fun hub -> 28 | Pipe.iter_without_pushback (Hub.listen hub) ~f:ignore 29 | >>> (fun () -> p "main hub closed"); 30 | p "hub created"; 31 | Hub.open_channel hub >>> fun c -> 32 | p "channel of hub created"; 33 | add c (fun x -> p (sprintf "adding 1 to %d" x); return (x + 1)) >>> fun c -> 34 | p "first process added"; 35 | add c (fun x -> p (sprintf "adding 2 to %d" x); return (x + 2)) >>> fun c -> 36 | p "second process added"; 37 | add c (fun x -> p (sprintf "adding 3 to %d" x); return (x + 3)) >>> fun c -> 38 | p "third process added"; 39 | let rec loop i = 40 | if i < 1_000_000 then begin 41 | let res = Channel.read c in 42 | Hub.send_to_all hub i; 43 | Hub.flushed hub 44 | >>> fun () -> 45 | p "sent"; 46 | res >>> fun i -> 47 | p (sprintf "adding 4 to %d" i); 48 | loop (i + 4) 49 | end else 50 | Pervasives.exit 0 51 | in 52 | after (sec 1.) >>> fun () -> 53 | loop 0 54 | ;; 55 | 56 | let () = 57 | Parallel.init ~cluster: 58 | {Cluster.master_machine = Unix.gethostname (); 59 | worker_machines = ["hkg-qws-r01"; "hkg-qws-r02"]} (); 60 | main (); 61 | never_returns (Scheduler.go ()) 62 | ;; 63 | -------------------------------------------------------------------------------- /example/simple.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let p s = Core.Printf.printf "%s: %s\n%!" (Pid.to_string (Unix.getpid ())) s 6 | 7 | let foo () = 8 | p "solving..."; 9 | Clock.after (sec 1.) >>| fun () -> "bar" 10 | ;; 11 | 12 | let main () = 13 | Parallel.run ~where:(`On "hkg-qws-r01") foo >>> function 14 | | Error e -> p (sprintf "died with exception %s" e) 15 | | Ok str -> 16 | p (sprintf "main process gets the result: %s" str); 17 | Shutdown.shutdown 0 18 | ;; 19 | 20 | let () = 21 | Exn.handle_uncaught ~exit:true (fun () -> 22 | Parallel.init ~cluster: 23 | {Cluster.master_machine = Unix.gethostname (); 24 | worker_machines = ["hkg-qws-r01"; "hkg-qws-r02"]} (); 25 | p "calling main"; 26 | main (); 27 | p "calling scheduler go"; 28 | never_returns (Scheduler.go ())) 29 | ;; 30 | -------------------------------------------------------------------------------- /example/wide.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let wide () = 6 | Deferred.List.iter [ 1; 2; 10; 100 ] ~f:(fun num_children -> 7 | Core.Printf.printf "creating: %d\n%!" num_children; 8 | Deferred.all 9 | (List.init num_children ~f:(fun i -> 10 | Parallel.run (fun () -> 11 | Core.Printf.printf "i: %d\n%!" 
i; 12 | return i) 13 | >>| function 14 | | Error e -> failwith e 15 | | Ok i -> i)) 16 | >>| fun l -> 17 | Core.Printf.printf "done!\n%!"; 18 | assert (l = List.init num_children ~f:Fn.id); 19 | Core.Printf.printf "assert ok\n%!") 20 | ;; 21 | 22 | let () = 23 | Parallel.init (); 24 | (wide () >>> fun () -> 25 | Core.Printf.printf "shutdown\n%!"; 26 | Shutdown.shutdown 0); 27 | never_returns (Scheduler.go ()) 28 | ;; 29 | -------------------------------------------------------------------------------- /src/channel.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | module Std_unix = Unix 3 | open Async 4 | open Import 5 | 6 | let _p s = Core.Printf.printf "%s: %s\n%!" (Time.to_string (Time.now ())) s 7 | let _size a = String.length (Marshal.to_string a [Marshal.Closures]) 8 | 9 | type reified = 10 | { fd: Std_unix.File_descr.t; 11 | reader: Reader.t; 12 | writer: Writer.t; 13 | } 14 | 15 | let next_id = ref 0 16 | let reified = Int.Table.create () 17 | 18 | 19 | type ('a, 'b) t = 20 | { socket : Unix.Inet_addr.t * int; 21 | mutable token : Token.t; 22 | mutable state: [ `Unconnected 23 | | `Reified of int 24 | | `Reifying of Bigstring.t Queue.t * unit Deferred.t 25 | | `Dead of exn ]; 26 | buffer_age_limit:[ `At_most of Time.Span.t | `Unlimited ] option; 27 | mutable errors : exn Tail.t option; 28 | } 29 | 30 | let socket t = t.socket 31 | 32 | let unreify t r id = 33 | don't_wait_for (Monitor.try_with (fun () -> Writer.close r.writer) >>| ignore); 34 | don't_wait_for (Monitor.try_with (fun () -> Reader.close r.reader) >>| ignore); 35 | Hashtbl.remove reified id; 36 | t.state <- `Unconnected 37 | 38 | let on_error t exn = 39 | match t.errors with 40 | | None -> raise exn 41 | | Some errors -> Tail.extend errors exn; 42 | ;; 43 | 44 | let reify t id = 45 | let s = Socket.create Socket.Type.tcp in 46 | socket_connect_inet s t.socket 47 | >>| fun s -> 48 | let r = 49 | { fd = Unix.Fd.file_descr_exn (Socket.fd s); 50 | reader = Reader.create (Socket.fd s); 51 | writer = Writer.create ?buffer_age_limit:t.buffer_age_limit (Socket.fd s); 52 | } 53 | in 54 | let monitor = Monitor.detach_and_get_error_stream (Writer.monitor r.writer) in 55 | Stream.iter_durably monitor ~f:(fun exn -> 56 | unreify t r id; 57 | on_error t exn); 58 | r 59 | ;; 60 | 61 | let errors t = 62 | let errors = 63 | match t.errors with 64 | | None -> 65 | let errors = Tail.create () in 66 | t.errors <- Some errors; 67 | errors 68 | | Some errors -> errors 69 | in 70 | Tail.collect errors 71 | ;; 72 | 73 | let create ?buffer_age_limit ~addr () = 74 | return { 75 | socket = addr; 76 | token = Token.mine; 77 | state = `Unconnected; 78 | buffer_age_limit; 79 | errors = None; 80 | } 81 | ;; 82 | 83 | let rec rereify t = 84 | t.token <- Token.mine; 85 | let q = Queue.create () in 86 | let r = 87 | let id = !next_id in 88 | incr next_id; 89 | reify t id >>| (fun r -> 90 | match t.state with 91 | | `Unconnected -> assert false 92 | | `Dead _ -> 93 | (* Being here implies we wern't sent to another process (because the deferred 94 | returned by reify can never be filled if we are), so it's ok to close the 95 | writer. 
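(The only way for the state to become [`Dead] while the reconnect is still in flight is a call to [close], which simply marks a [`Reifying] channel as dead and leaves the freshly established connection to be torn down here.)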
*) 96 | don't_wait_for 97 | (Monitor.try_with 98 | (fun () -> Writer.close r.writer) >>| ignore) 99 | | `Reified _ -> assert false 100 | | `Reifying _ -> 101 | t.state <- `Reified id; 102 | Hashtbl.set reified ~key:id ~data:r; 103 | Queue.iter q ~f:(fun v -> write_bigstring ~can_destroy:false t v)) 104 | in 105 | t.state <- `Reifying (q, r); 106 | r 107 | 108 | and write_bigstring ~can_destroy t v = 109 | match t.state with 110 | | `Dead exn -> on_error t exn 111 | | _ -> 112 | if not (Token.valid t.token) then begin 113 | let _ = rereify t in 114 | write_bigstring ~can_destroy t v 115 | end else begin 116 | match t.state with 117 | | `Unconnected -> 118 | don't_wait_for (rereify t); 119 | (* Calling [write_bigstring] isn't an infinite loop, because [rereify] changes the 120 | state of [t] to [`Reifying]. *) 121 | write_bigstring ~can_destroy t v; 122 | | `Reified r -> 123 | begin match Hashtbl.find reified r with 124 | | None -> assert false 125 | | Some r -> 126 | try 127 | Writer.schedule_bigstring r.writer v; 128 | if can_destroy 129 | then (Writer.flushed r.writer >>> fun () -> Bigstring.unsafe_destroy v) 130 | else () 131 | with exn -> on_error t exn 132 | end 133 | | `Reifying (q, _) -> Queue.enqueue q v 134 | | `Dead _ -> assert false 135 | end 136 | ;; 137 | 138 | type 'a pre_packed = Bigstring.t 139 | 140 | let pre_pack v = 141 | try 142 | Bigstring_marshal.marshal ~flags:[Marshal.Closures] v 143 | with e -> 144 | let tag = Obj.tag (Obj.repr v) in 145 | let size = Obj.size (Obj.repr v) in 146 | let subv = Obj.field (Obj.repr v) 0 in 147 | Core.Printf.printf "Channel.pre_pack: tag=%d; size=%d; exn %s\n%!" tag size (Exn.to_string e); 148 | Core.Printf.printf "subv: tag = %d; size = %d\n%!" (Obj.tag subv) (Obj.size subv); 149 | List.iter ~f:(fun (name, tag) -> Core.Printf.printf "%s = %d\n%!" 
name tag) [ 150 | "custom_tag", Obj.custom_tag; 151 | "lazy_tag", Obj.lazy_tag; 152 | "closure_tag", Obj.closure_tag; 153 | "object_tag", Obj.object_tag; 154 | "infix_tag", Obj.infix_tag; 155 | "forward_tag", Obj.forward_tag; 156 | "no_scan_tag", Obj.no_scan_tag; 157 | "abstract_tag", Obj.abstract_tag; 158 | "string_tag", Obj.string_tag; 159 | "double_tag", Obj.double_tag; 160 | "double_array_tag", Obj.double_array_tag; 161 | "custom_tag", Obj.custom_tag; 162 | "int_tag", Obj.int_tag; 163 | "out_of_heap_tag", Obj.out_of_heap_tag; 164 | "unaligned_tag", Obj.unaligned_tag; 165 | ]; 166 | raise e 167 | ;; 168 | 169 | let write_pre_packed t v = write_bigstring ~can_destroy:false t v 170 | 171 | let write t v = 172 | let bs = Bigstring_marshal.marshal ~flags:[Marshal.Closures] v in 173 | write_bigstring ~can_destroy:true t bs 174 | 175 | let check_not_dead t = 176 | match t.state with 177 | | `Dead e -> raise e 178 | | _ -> () 179 | ;; 180 | 181 | let rec read_full t = 182 | check_not_dead t; 183 | if not (Token.valid t.token) then 184 | rereify t >>= (fun () -> read_full t) 185 | else begin 186 | match t.state with 187 | | `Unconnected -> rereify t >>= (fun () -> read_full t) 188 | | `Reified id -> 189 | begin match Hashtbl.find reified id with 190 | | None -> assert false 191 | | Some r -> 192 | Monitor.try_with (fun () -> Reader.read_marshal r.reader) 193 | >>| function 194 | | Ok res -> res 195 | | Error exn -> unreify t r id; raise exn 196 | end 197 | | `Reifying (_, r) -> r >>= (fun () -> read_full t) 198 | | `Dead exn -> raise exn 199 | end 200 | ;; 201 | 202 | let read t = 203 | read_full t >>| function 204 | | `Eof -> raise End_of_file 205 | | `Ok a -> a 206 | ;; 207 | 208 | exception Closed [@@deriving sexp] 209 | 210 | let close t = 211 | if Token.valid t.token then begin 212 | match t.state with 213 | | `Unconnected | `Dead _ -> return () 214 | | `Reifying _ -> t.state <- `Dead Closed; return () 215 | | `Reified r -> 216 | t.state <- `Dead Closed; 217 | begin match Hashtbl.find reified r with 218 | | None -> () 219 | | Some r -> 220 | ignore (Monitor.try_with (fun () -> 221 | Writer.close r.writer 222 | >>= fun () -> 223 | Reader.close r.reader)) 224 | end; 225 | Hashtbl.remove reified r; 226 | return () 227 | end else 228 | return () 229 | ;; 230 | 231 | let rec flushed t = 232 | check_not_dead t; 233 | if not (Token.valid t.token) then Deferred.unit 234 | else begin 235 | match t.state with 236 | | `Unconnected -> Deferred.unit 237 | | `Dead e -> raise e 238 | | `Reifying (_, r) -> r >>= (fun () -> flushed t) 239 | | `Reified r -> 240 | begin match Hashtbl.find reified r with 241 | | None -> assert false 242 | | Some r -> Writer.flushed r.writer 243 | end 244 | end 245 | -------------------------------------------------------------------------------- /src/channel.mli: -------------------------------------------------------------------------------- 1 | (** A [Channel.t] is a bi-directional communication channel for communicating to a 2 | [Hub.t]. Channels are portable across processes. A channel can be sent to another 3 | process, either explicitly or by being in a closure and it will continue to work. *) 4 | 5 | open! Core 6 | open! Async 7 | open! Import 8 | 9 | type ('to_hub, 'from_hub) t 10 | 11 | (** [create] is type-unsafe, and should not be used by user code. 
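Nothing constrains the result's ['to_hub] and ['from_hub] parameters to match what the hub listening on [addr] actually sends and receives; values cross the wire via [Marshal], so a mismatch only shows up at runtime. Channels meant for user code come from [Parallel.spawn] or [Hub.open_channel] instead, e.g. (an illustrative sketch in the style of example/pingpong.ml; [worker] is a placeholder for a function of type [(_, _) Hub.t -> _ Deferred.t]):

{[
  Parallel.spawn ~where:Parallel.random worker >>= fun (channel, _result) ->
  Channel.write channel `Ping;
  Channel.read channel >>= fun `Pong ->
  ...
]}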
*) 12 | val create 13 | : ?buffer_age_limit:[ `At_most of Time.Span.t | `Unlimited ] 14 | -> addr:Unix.Inet_addr.t * int 15 | -> unit 16 | -> (_, _) t Deferred.t 17 | val close : (_ , _ ) t -> unit Deferred.t 18 | val read : (_ , 'b) t -> 'b Deferred.t 19 | val read_full : (_ , 'b) t -> 'b Reader.Read_result.t Deferred.t 20 | val write : ('a, _ ) t -> 'a -> unit 21 | 22 | type 'a pre_packed 23 | val pre_pack : 'a -> 'a pre_packed 24 | val write_pre_packed : ('a, _) t -> 'a pre_packed -> unit 25 | 26 | val flushed : (_, _) t -> unit Deferred.t 27 | val socket : (_, _) t -> Unix.Inet_addr.t * int 28 | 29 | (** Similar to [Monitor.detach_and_get_error_stream], collects all writer errors. If this function has never 30 | been called, then exceptions will be raised directly *) 31 | val errors : (_, _) t -> exn Stream.t 32 | -------------------------------------------------------------------------------- /src/hub.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Import 4 | 5 | module Client_id : sig 6 | type t [@@deriving sexp] 7 | 8 | include Comparable with type t := t 9 | include Hashable with type t := t 10 | 11 | val zero : t 12 | val succ : t -> t 13 | end = Int 14 | 15 | type con = { 16 | reader: Reader.t; 17 | writer: Writer.t; 18 | } 19 | 20 | type 'a update = 21 | [ `Connect of Client_id.t 22 | | `Disconnect of Client_id.t * string 23 | | `Data of (Client_id.t * 'a) 24 | ] 25 | 26 | type ('a, 'b) t = 27 | { mutable next_client: Client_id.t; 28 | mutable listening: bool; 29 | shutdown: unit Ivar.t; 30 | is_shutdown: unit Ivar.t; 31 | pipe_r: 'a update Pipe.Reader.t; 32 | pipe_w: 'a update Pipe.Writer.t; 33 | clients: (Client_id.t, con) Hashtbl.t; 34 | socket: ([`Passive], Socket.Address.Inet.t) Socket.t; 35 | addr : Unix.Inet_addr.t * int; 36 | token: Token.t; 37 | buffer_age_limit: Writer.buffer_age_limit option; 38 | } 39 | 40 | exception Hubs_are_not_portable_between_processes 41 | 42 | let close t = 43 | if not t.listening then 44 | Monitor.try_with (fun () -> Unix.close (Socket.fd t.socket)) >>| ignore 45 | else begin 46 | Ivar.fill_if_empty t.shutdown (); 47 | Ivar.read t.is_shutdown 48 | end 49 | 50 | let create ?buffer_age_limit socket = 51 | let ip = Lazy.force my_ip in 52 | let `Inet (_, port) = Socket.getsockname socket in 53 | let (pipe_r, pipe_w) = Pipe.create () in 54 | return 55 | { next_client = Client_id.zero; 56 | listening = false; 57 | shutdown = Ivar.create (); 58 | is_shutdown = Ivar.create (); 59 | pipe_r; 60 | pipe_w; 61 | clients = Hashtbl.Poly.create () ~size:1; 62 | socket; 63 | addr = (ip, port); 64 | buffer_age_limit; 65 | token = Token.mine; } 66 | ;; 67 | 68 | let handle_client t ~close ~conn ~id ~stop = 69 | let read () = try_with (fun () -> Reader.read_marshal conn.reader) in 70 | let rec loop () = 71 | let res = 72 | choose 73 | [ choice stop (fun () -> `Stop); 74 | choice (read ()) (fun x -> `Read x); 75 | ] 76 | in 77 | res >>> function 78 | | `Stop -> () 79 | | `Read x -> 80 | match x with 81 | | Error e -> close (Exn.to_string e) 82 | | Ok x -> 83 | match x with 84 | | `Eof -> close "saw EOF while reading" 85 | | `Ok a -> 86 | Pipe.write t.pipe_w (`Data (id, a)) 87 | >>> fun () -> 88 | if Option.is_none (Deferred.peek stop) then loop () 89 | in 90 | loop () 91 | ;; 92 | 93 | let listener t = 94 | let shutdown = Ivar.read t.shutdown in 95 | let rec loop () = 96 | try_with (fun () -> Socket.accept_interruptible ~interrupt:shutdown t.socket) 97 | >>> function 98 | | Ok `Interrupted 
-> begin 99 | let cl = Hashtbl.data t.clients in 100 | Deferred.List.iter ~how:`Parallel cl ~f:(fun {reader=_; writer} -> 101 | Monitor.try_with (fun () -> Writer.close writer) >>| ignore) 102 | >>> fun () -> 103 | Monitor.try_with (fun () -> Unix.close (Socket.fd t.socket)) 104 | >>> fun _ -> 105 | Ivar.fill_if_empty t.is_shutdown () 106 | end 107 | | Error e -> 108 | Monitor.send_exn (Monitor.current ()) e; 109 | Clock.after (sec 0.5) >>> loop 110 | | Ok `Socket_closed -> () 111 | | Ok (`Ok (sock, _)) -> 112 | let fd = Socket.fd sock in 113 | let id = 114 | let id = t.next_client in 115 | t.next_client <- Client_id.succ id; 116 | id 117 | in 118 | let conn = { writer = Writer.create ?buffer_age_limit:t.buffer_age_limit fd; 119 | reader = Reader.create fd } in 120 | let closed = Ivar.create () in 121 | let close = 122 | let error = ref "" in 123 | let close = 124 | lazy (Pipe.write t.pipe_w (`Disconnect (id, !error)) 125 | >>> fun () -> 126 | don't_wait_for 127 | (Monitor.try_with (fun () -> 128 | Writer.close conn.writer) >>| ignore); 129 | Hashtbl.remove t.clients id; 130 | Ivar.fill closed ()) 131 | in 132 | (fun e -> error := e; Lazy.force close) 133 | in 134 | Stream.iter (Monitor.detach_and_get_error_stream (Writer.monitor conn.writer)) ~f:(fun e -> 135 | let s = Exn.to_string e in 136 | Core.Printf.printf "%s hub writer error %s\n%!" (Pid.to_string (Unix.getpid ())) s; 137 | close s); 138 | Hashtbl.add_exn t.clients ~key:id ~data:conn; 139 | handle_client t ~close ~conn ~id ~stop:(Ivar.read closed); 140 | Pipe.write t.pipe_w (`Connect id) 141 | >>> loop 142 | in 143 | loop () 144 | ;; 145 | 146 | let ensure_valid t = 147 | if not (Token.valid t.token) then raise Hubs_are_not_portable_between_processes 148 | ;; 149 | 150 | let listen t = 151 | ensure_valid t; 152 | if t.listening then failwith "already listening"; 153 | t.listening <- true; 154 | listener t; 155 | t.pipe_r 156 | ;; 157 | 158 | let listen_simple t = 159 | Pipe.filter_map (listen t) ~f:(function 160 | | `Connect _ | `Disconnect _ -> None 161 | | `Data (c, a) -> Some (c, a)) 162 | ;; 163 | 164 | let send t id a = 165 | ensure_valid t; 166 | match Hashtbl.find t.clients id with 167 | | None -> () 168 | | Some conn -> 169 | write_marshal conn.writer ~flags:[Marshal.Closures] a 170 | ;; 171 | 172 | let send_to_all t a = 173 | ensure_valid t; 174 | let s = Marshal.to_string a [Marshal.Closures] in 175 | Hashtbl.iteri t.clients ~f:(fun ~key:_ ~data:conn -> Writer.write conn.writer s); 176 | ;; 177 | 178 | let flushed t = 179 | Deferred.all_unit 180 | (Hashtbl.fold t.clients ~init:[] ~f:(fun ~key:_ ~data:conn acc -> 181 | Writer.flushed conn.writer :: acc)) 182 | ;; 183 | 184 | let open_channel t = Channel.create ~addr:t.addr () 185 | let socket t = t.addr 186 | let clients t = Hashtbl.keys t.clients 187 | -------------------------------------------------------------------------------- /src/hub.mli: -------------------------------------------------------------------------------- 1 | (** A hub is a place to which any number (possibly zero) of clients can connect a channel 2 | and send messages. The process in which the hub is created is responsible for 3 | listening to the messages clients send, and can send messages to an individual client, 4 | or broadcast a message to all clients. 5 | 6 | Unless otherwise noted none of the below functions may be called in a process other 7 | than the one that created the hub. *) 8 | 9 | open! Core 10 | open! Async 11 | open! 
Import 12 | 13 | module Client_id : sig 14 | type t [@@deriving sexp] 15 | include Comparable with type t := t 16 | include Hashable with type t := t 17 | end 18 | 19 | type ('from_client, 'to_client) t 20 | 21 | val create : ?buffer_age_limit:Writer.buffer_age_limit 22 | -> ([`Passive], Socket.Address.Inet.t) Socket.t 23 | -> (_, _) t Deferred.t 24 | 25 | (* Note, this will close the listening socket as well as all connected clients. *) 26 | val close : (_, _) t -> unit Deferred.t 27 | 28 | (** [listen] and [listen_simple] start a loop that accepts connections from clients 29 | that wish to open channels connected to the hub. [listen_simple] returns the 30 | sequence of messages sent by clients. [listen] returns those, intermixed with 31 | messages indicating when clients [`Connect] and [`Disconnect]. 32 | 33 | [listen] or [listen_simple] should be called exactly once for a given hub. 34 | Subsequent calls will raise. *) 35 | val listen 36 | : ('a, _) t 37 | -> [ `Connect of Client_id.t 38 | | `Disconnect of Client_id.t * string 39 | | `Data of Client_id.t * 'a 40 | ] Pipe.Reader.t 41 | 42 | val listen_simple : ('a, _) t -> (Client_id.t * 'a) Pipe.Reader.t 43 | 44 | val send : (_, 'a) t -> Client_id.t -> 'a -> unit 45 | val send_to_all : (_, 'a) t -> 'a -> unit 46 | val flushed : (_, _) t -> unit Deferred.t 47 | 48 | val clients : (_, _) t -> Client_id.t list 49 | 50 | (** open_channel may be called even in a different process than the creator of the hub. *) 51 | val open_channel : ('a, 'b) t -> ('a, 'b) Channel.t Deferred.t 52 | val socket : (_, _) t -> Unix.Inet_addr.t * int 53 | -------------------------------------------------------------------------------- /src/import.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | module U = Unix 3 | 4 | module Socket_file : sig 5 | type t [@@deriving sexp_of] 6 | 7 | include Stringable with type t := t 8 | end = String 9 | 10 | module Debug = Async_kernel.Async_kernel_private.Debug 11 | 12 | let debug = Debug.parallel 13 | 14 | let oc = lazy (Out_channel.create (sprintf "/tmp/%d.log" (Pid.to_int (Unix.getpid ())))) 15 | 16 | let dbp msg = 17 | let oc = Lazy.force oc in 18 | Out_channel.output_string oc msg; 19 | Out_channel.newline oc; 20 | Out_channel.flush oc 21 | 22 | let my_ip = lazy begin 23 | try 24 | (match U.Host.getbyname (U.gethostname ()) with 25 | | None -> U.Inet_addr.localhost 26 | | Some host -> 27 | let addrs = 28 | Array.filter host.U.Host.addresses ~f:(fun a -> a <> U.Inet_addr.localhost) 29 | in 30 | if Array.length addrs = 0 then 31 | U.Inet_addr.localhost 32 | else 33 | addrs.(0)) 34 | with _ -> U.Inet_addr.localhost 35 | end 36 | 37 | module Cluster = struct 38 | type t = { 39 | master_machine: string; (* DNS name of the machine that will start it all *) 40 | worker_machines: string list; (* DNS name of worker machines *) 41 | } [@@deriving sexp, bin_io] 42 | end 43 | 44 | let socket_connect_inet socket (addr, port) = 45 | let addr = 46 | if U.Inet_addr.(=) addr (Lazy.force my_ip) 47 | then begin 48 | U.Inet_addr.localhost 49 | end 50 | else addr 51 | in 52 | Async.Socket.connect socket (`Inet (addr, port)) 53 | 54 | include (struct 55 | let write_marshal writer ~flags v = 56 | Async.Writer.schedule_iovec writer 57 | (Unix.IOVec.of_bigstring (Bigstring_marshal.marshal ~flags v)) 58 | ~destroy_or_keep:Destroy 59 | ;; 60 | end : sig 61 | (** [write_marshal] serializes data using [marshal] and writes it to the writer. 
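The marshalled bytes are scheduled on the writer as a single iovec with [~destroy_or_keep:Destroy], so the temporary bigstring holding them is released once the writer has written it out.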
*) 62 | val write_marshal : Async.Writer.t -> flags : Marshal.extern_flags list -> _ -> unit 63 | end) 64 | -------------------------------------------------------------------------------- /src/intf.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Import 4 | 5 | let master_pid = ref None 6 | 7 | let init ?cluster ?close_stdout_and_stderr ?(fail_if_async_has_been_initialized = true) () = 8 | if fail_if_async_has_been_initialized && not (Scheduler.is_ready_to_initialize ()) then 9 | failwith "Parallel.init called after async was initialized"; 10 | master_pid := Some (Master_process.init ?cluster ?close_stdout_and_stderr ()); 11 | ;; 12 | 13 | let shutdown () = 14 | if debug then dbp "Intf.shutdown called"; 15 | match !master_pid with 16 | | None -> return (Error (Error.of_string "\ 17 | Called Parallel.shutdown incorrectly. 18 | Either parallel isn't running or we're not the main process. 19 | ")) 20 | | Some master_pid -> 21 | Master_process.shutdown () 22 | >>= fun () -> 23 | Unix.waitpid master_pid 24 | >>| fun result -> 25 | match result with 26 | | Ok () -> Ok () 27 | | Error exit_or_signal -> 28 | Error (Error.create "Parallel's master process did not exit cleanly" exit_or_signal 29 | ([%sexp_of: Unix.Exit_or_signal.error])) 30 | ;; 31 | 32 | let spawn (type a) (type b) (type c) 33 | ?buffer_age_limit ?where (f : (a, b) Hub.t -> c Deferred.t) = 34 | Master_process.create_process ?where () >>= function 35 | | Error e -> failwithf "error talking to master process %s" (Error.to_string_hum e) () 36 | | Ok (addr, port) -> 37 | let module C = Channel in 38 | let module T = Worker_process.To_worker in 39 | let module F = Worker_process.From_worker in 40 | C.create ?buffer_age_limit ~addr:(addr, port) () 41 | (* [control_channel] is the first connection that the worker accepts. We write on it 42 | the task to run, and then wait for the worker to send back the result. 
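The second connection, [data_channel], is the one handed back to the caller: as documented in intf.mli, it is connected to the hub that [f] receives, so the caller uses it to talk to the spawned worker. Both channels are closed as soon as the worker's result (or an Eof) comes back on the control channel.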
*) 43 | >>= fun (control_channel : ((a, b, c) T.t, c F.t) C.t) -> 44 | C.create ?buffer_age_limit ~addr:(addr, port) () 45 | >>= fun data_channel -> 46 | let close c = 47 | don't_wait_for (Monitor.try_with (fun () -> C.close c) >>| ignore) 48 | in 49 | C.write control_channel (Worker_process.To_worker.Run (buffer_age_limit, f)); 50 | C.flushed control_channel 51 | >>| fun () -> 52 | let res = 53 | C.read_full control_channel >>| (function 54 | | `Eof -> Error "Eof while reading process result" 55 | | `Ok r -> 56 | match r with 57 | | F.Result a -> Ok a 58 | | F.Exn e -> Error e) 59 | in 60 | upon res (fun _ -> close control_channel; close data_channel); 61 | (data_channel, res) 62 | ;; 63 | 64 | let run ?buffer_age_limit ?where f = 65 | spawn ?buffer_age_limit ?where (fun _ -> f ()) >>= fun (_, res) -> res 66 | ;; 67 | 68 | let st = lazy (Random.State.make_self_init ()) 69 | let hub ?buffer_age_limit () = 70 | let rec pick_port port = 71 | let s = Socket.create Socket.Type.tcp in 72 | Monitor.try_with (fun () -> 73 | Socket.bind s (Socket.Address.Inet.create Unix.Inet_addr.bind_any ~port)) 74 | >>= function 75 | | Ok s -> return s 76 | | Error r -> 77 | match Monitor.extract_exn r with 78 | | Unix.Unix_error (EADDRINUSE, _, _) -> 79 | ignore (Monitor.try_with (fun () -> Unix.close (Socket.fd s))); 80 | pick_port (port + 1 + Random.State.int (Lazy.force st) 10) 81 | | exn -> raise exn 82 | in 83 | pick_port 10000 >>= fun s -> 84 | Hub.create ?buffer_age_limit (Socket.listen s) 85 | ;; 86 | 87 | let is_worker_machine () = Core.Sys.getenv "ASYNC_PARALLEL_IS_CHILD_MACHINE" <> None 88 | 89 | let round_robin = Master_process.round_robin 90 | let random = Master_process.random 91 | let random_in = Master_process.random_in 92 | -------------------------------------------------------------------------------- /src/intf.mli: -------------------------------------------------------------------------------- 1 | open! Core 2 | open! Async 3 | open! Import 4 | 5 | (** [init] initializes the system and creates the master process. [master_init], if 6 | specified, is called in the master process and may be used for cleanup/initialization 7 | such as closing file descriptors. [init] should be called before any threads are 8 | created. If your program daemonizes, call [init] after you daemonize, but before you 9 | start the async scheduler. [init] may only be called once. 10 | 11 | If [cluster] is specified, and it specifies a set of host names or ips that you can 12 | ssh to without a password, and have permission to run programs on, then a master 13 | process will also be started on every machine in the cluster, and worker processes may 14 | be spawned on any machine in the cluster. Note that in ord 15 | *) 16 | val init : ?cluster:Cluster.t 17 | (** Because [init] forks, it is very hard to reason about your program if anything 18 | asyncy has already happened. So, by default, [Parallel.init] fails if you call it 19 | after doing anything asyncy (defined as "have created the async scheduler"). You 20 | can override this behavior, but it would be much better to change your program to 21 | call [init] earlier. 22 | 23 | [~close_stdout_and_stderr] will close [stdout] and [stderr] in the master process. 24 | 25 | [~fail_if_async_has_been_initialized] exists only because of code that existed prior 26 | to [init] checking whether the Async scheduler has been initialized. That old code 27 | uses [~fail_if_async_has_been_initialized:false]. 
All new code should not supply 28 | [~fail_if_async_has_been_initialized] and should accept the default. *) 29 | -> ?close_stdout_and_stderr : bool (* default is false *) 30 | -> ?fail_if_async_has_been_initialized : bool (* default is true *) 31 | -> unit 32 | -> unit 33 | 34 | (** [shutdown] requests that the master process kill all workers and then shutdown. It 35 | then waits for the master process to exit. [shutdown] returns [Ok ()] when the master 36 | exits without problems; otherwise it returns an error. *) 37 | val shutdown : unit -> unit Or_error.t Deferred.t 38 | 39 | 40 | 41 | (** Run the specified closure in another process and return its result. 42 | 43 | If [where] is specified, it controls which machine the process is spawned on. The 44 | default is the local machine. You must have passed a list of machines to init in order 45 | to use `On, or `Random_on. An exception will be raised if you try to use a machine you 46 | didn't pass to init. 47 | 48 | The closure you pass may not contain custom blocks with unimplemented serialization 49 | functions or Abstract values. Anything you can't pass to Marshal, you can't pass to 50 | spawn. 51 | *) 52 | val run 53 | : ?buffer_age_limit:[ `At_most of Time.Span.t | `Unlimited ] 54 | -> ?where:[`Local | `On of string | `F of (unit -> string)] 55 | -> (unit -> 'a Deferred.t) 56 | -> ('a, string) Result.t Deferred.t 57 | 58 | (** [spawn f] spawns a process running [f], supplying [f] a hub that it may use to 59 | communicate with other processes. [f] should listen to the hub to receive messages 60 | from the clients. [spawn] returns a channel connected to [f]'s hub, and a deferred 61 | that will become determined if [f] returns. 62 | 63 | There is no guarantee that the deferred returned by this function will become 64 | determined before the spawned process runs, as such the following code is a race, and 65 | may never return. 66 | 67 | | spawn (fun hub -> Hub.send_to_all hub `Hello; Deferred.never ()) 68 | | >>= fun (channel, _) -> 69 | | Channel.read channel 70 | | >>= fun `Hello -> 71 | | ... 72 | 73 | It IS however guaranteed that the spawned process is listening when the deferred 74 | returned by this function is returned, it is theirfore recommended that the spawning 75 | process initiate the first communication. 76 | 77 | If [where] is specified, it controls which machine the process is spawned on. The 78 | default is the local machine. You must have passed a list of machines to init in order 79 | to use `On, or `Random_on. An exception will be raised if you try to use a machine you 80 | didn't pass to init. 81 | 82 | The closure you pass may not contain custom blocks with unimplemented serialization 83 | functions or Abstract values. Anything you can't pass to Marshal, you can't pass to 84 | spawn. 85 | *) 86 | val spawn 87 | : ?buffer_age_limit:Writer.buffer_age_limit 88 | -> ?where:[`Local | `On of string | `F of (unit -> string)] 89 | -> (('a, 'b) Hub.t -> 'c Deferred.t) 90 | -> (('a, 'b) Channel.t * ('c, string) Result.t Deferred.t) Deferred.t 91 | 92 | (** create a new hub. *) 93 | val hub : ?buffer_age_limit:Writer.buffer_age_limit 94 | -> unit 95 | -> (_, _) Hub.t Deferred.t 96 | 97 | (** returns true if this is a worker machine. See the notes on running on multiple 98 | machines in Std.ml. *) 99 | val is_worker_machine : unit -> bool 100 | 101 | (* Process distribution methods *) 102 | 103 | (* Will spread processes evenly over cluster machines in a round robin way, each spawn 104 | will go to a different cluster machine. 
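(So, for example, repeated calls of the form [run ~where:round_robin f] issued from one process cycle through the cluster machines in turn.)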
Of course its state is local to the current 105 | process, so in a new worker process it will always put a worker created from that 106 | machine on the first host. As a result random may be a better choice for multi level 107 | programs (workers creating workers in a tree like structure) because it does 108 | Random.State.make_self_init in each process. *) 109 | val round_robin : [> `F of (unit -> string)] 110 | 111 | (* Will pick a random cluster machine. If you have a lot of jobs, this will result in a 112 | pretty even distribution (and is useful for testing that your program isn't machine 113 | dependent. *) 114 | val random : [> `F of (unit -> string)] 115 | 116 | (* Will pick a random cluster machine in the specified subset of machines. *) 117 | val random_in : string list -> [> `F of (unit -> string)] 118 | -------------------------------------------------------------------------------- /src/jbuild: -------------------------------------------------------------------------------- 1 | (library 2 | ((name async_parallel_deprecated) 3 | (public_name async_parallel) 4 | (libraries (async 5 | core 6 | sexplib)) 7 | (preprocess (pps (ppx_jane ppxlib.runner))))) 8 | 9 | 10 | (jbuild_version 1) 11 | -------------------------------------------------------------------------------- /src/master_process.ml: -------------------------------------------------------------------------------- 1 | (* The master process cannot use async, because async would start threads. We also need 2 | to throw away any async state that might have been there when we were forked. *) 3 | 4 | open Core 5 | open Import 6 | 7 | module U = Unix 8 | 9 | module File_descr = U.File_descr 10 | 11 | module Request = struct 12 | type t = 13 | | Create_worker_process 14 | | Shutdown 15 | | Heartbeat 16 | end 17 | 18 | module Update = struct 19 | type t = 20 | | Create_worker_process_response of (U.Inet_addr.t * int) Or_error.t 21 | | Heartbeat 22 | end 23 | 24 | exception Cannot_find_available_port 25 | 26 | let syscall th = 27 | let rec loop () = 28 | try th () 29 | with U.Unix_error (EINTR, _, _) -> loop () 30 | in loop () 31 | 32 | let st = lazy (Random.State.make_self_init ()) 33 | let listener () = 34 | let rec loop port = 35 | let s = U.socket ~domain:U.PF_INET ~kind:U.SOCK_STREAM ~protocol:0 in 36 | try 37 | U.bind s ~addr:(U.ADDR_INET (U.Inet_addr.bind_any, port)); 38 | U.listen ~backlog:500 s; 39 | U.set_nonblock s; 40 | U.setsockopt s U.SO_REUSEADDR true; 41 | s, (Lazy.force my_ip, port) 42 | with 43 | | U.Unix_error (EADDRINUSE, _, _) -> 44 | syscall (fun () -> U.close s); 45 | let port = port + 1 + Random.State.int (Lazy.force st) 10 in 46 | if port < 30000 then loop port 47 | else raise Cannot_find_available_port 48 | | exn -> raise exn 49 | in 50 | loop 10000 51 | ;; 52 | 53 | let really_io 54 | (th : ?pos:int -> ?len:int -> File_descr.t -> buf:'a -> int) 55 | fd (buf : 'a) = 56 | let rec loop ~pos ~len = 57 | match syscall (fun () -> th fd ~buf ~pos ~len) with 58 | | 0 -> raise End_of_file 59 | | finished -> 60 | if finished < len then 61 | loop ~pos:(pos + finished) ~len:(len - finished) 62 | in 63 | loop 64 | ;; 65 | 66 | let write c update = 67 | let s = Marshal.to_string update [] in 68 | really_io (U.single_write_substring ~restart:false) c s ~len:(String.length s) ~pos:0 69 | ;; 70 | 71 | let read = 72 | assert (Marshal.header_size < 4096); 73 | let buf = ref (Bytes.create 4096) in 74 | fun c -> 75 | really_io (U.read ~restart:false) c !buf ~pos:0 ~len:Marshal.header_size; 76 | let len = Marshal.data_size 
!buf 0 in 77 | if len + Marshal.header_size > Bytes.length !buf then begin 78 | let new_buf = Bytes.create (len + Marshal.header_size) in 79 | Bytes.blit 80 | ~src:!buf ~dst:new_buf ~src_pos:0 81 | ~dst_pos:0 ~len:Marshal.header_size; 82 | buf := new_buf 83 | end; 84 | really_io (U.read ~restart:false) c !buf ~pos:Marshal.header_size ~len; 85 | Marshal.from_bytes !buf 0 86 | ;; 87 | 88 | module Signals = struct 89 | module S = Signal 90 | 91 | let existing = ref [] 92 | 93 | let handle ~onexit ~onchld = 94 | let exit_sigs = [S.hup; S.int; S.quit; S.term] in 95 | List.iter exit_sigs ~f:(fun s -> 96 | let existing_behavior = S.Expert.signal s (`Handle onexit) in 97 | (* allow the user to ignore some (or all) exit signals. *) 98 | if existing_behavior = `Ignore then S.ignore s 99 | else existing := (s, existing_behavior) :: !existing); 100 | let existing_chld = S.Expert.signal S.chld (`Handle onchld) in 101 | existing := (S.chld, existing_chld) :: !existing; 102 | let existing_pipe = S.Expert.signal S.pipe `Ignore in 103 | existing := (S.pipe, existing_pipe) :: !existing; 104 | ;; 105 | 106 | let restore () = List.iter !existing ~f:(fun (s, b) -> S.Expert.set s b) 107 | end 108 | 109 | (* This table contains all machines master process addresses *) 110 | let machines = lazy (String.Table.create ()) 111 | 112 | (* Only the master machine has this, it is a list of Unix.Process_info.t for each child 113 | machine the master spawned. *) 114 | let child_machines = ref [] 115 | 116 | (* This is the name of local machine *) 117 | let local_name = ref None 118 | 119 | (* This is the name of the master machine (as found in the machines table) *) 120 | let master_name = ref None 121 | 122 | let create_worker_process () = 123 | (* Create a listening socket for the child so it can communicate with other 124 | processes. We will communicate the address back to the creator. *) 125 | let (s, addr) = listener () in 126 | match syscall (fun () -> U.fork ()) with 127 | | `In_the_parent pid -> 128 | if debug then dbp (sprintf "created worker process with pid %d" (Pid.to_int pid)); 129 | syscall (fun () -> U.close s); 130 | (pid, addr) 131 | | `In_the_child -> 132 | Signals.restore (); 133 | (* We're in the child, so it's OK to use Async. *) 134 | let open Async in 135 | let control_socket = 136 | Socket.of_fd 137 | (Fd.create (Fd.Kind.Socket `Passive) s 138 | (Info.of_string 139 | (sprintf "" 140 | (Pid.to_string (U.getpid ()))))) 141 | Socket.Type.tcp 142 | in 143 | never_returns (Worker_process.run ~control_socket) 144 | ;; 145 | 146 | let talk_machine ip port talk = 147 | let s = U.socket ~domain:U.PF_INET ~kind:U.SOCK_STREAM ~protocol:0 in 148 | let close () = try syscall (fun () -> U.close s) with _ -> () in 149 | try 150 | U.setsockopt s U.TCP_NODELAY true; 151 | let prev = Signal.Expert.signal Signal.alrm (`Handle (fun _ -> close ())) in 152 | ignore (U.alarm 60); 153 | syscall (fun () -> U.connect s ~addr:(U.ADDR_INET (ip, port))); 154 | let res = talk s in 155 | ignore (U.alarm 0); 156 | Signal.Expert.set Signal.alrm prev; 157 | close (); 158 | res 159 | with exn -> close (); Error exn 160 | 161 | let ping_master_machine () = 162 | let last = ref (Time.now ()) in 163 | fun () -> 164 | match !master_name with 165 | | None -> Ok () 166 | | Some name -> 167 | if Time.diff (Time.now ()) !last <= sec 60. 
then Ok () 168 | else begin 169 | let (ip, port) = Hashtbl.find_exn (Lazy.force machines) name in 170 | talk_machine ip port (fun s -> 171 | write s Request.Heartbeat; 172 | match (read s : Update.t) with 173 | | Update.Heartbeat -> last := Time.now (); Ok () 174 | | Update.Create_worker_process_response _ -> assert false) 175 | end 176 | 177 | let transfer_to close in_fd out_fd = 178 | try 179 | let buf = Bytes.make 512 '\000' in 180 | let len = syscall (fun () -> U.read in_fd ~buf ~len:512 ~pos:0) in 181 | if len = 0 then close () 182 | else really_io (U.single_write ~restart:false) out_fd buf ~len ~pos:0 183 | with _ -> close () 184 | 185 | (* [run] is called exactly once, in the master process, when it is first starting. *) 186 | let run listener : never_returns = 187 | (* The main process may have already done some async stuff that created the async 188 | scheduler. So, we reset async at the start of the master. This puts async in 189 | a pristine state at the start of each worker that the master forks. *) 190 | Async.Scheduler.reset_in_forked_process (); 191 | (* We put a pipe (that we've created) into every call to select. This allows us to wake 192 | up select whenever we want by writing to the pipe. This is the buffer that holds the 193 | wakeup events. It's 50 bytes long, because we write a wakeup event for every time we 194 | get sigchld, and when we read we want to get all the wakeup events at once. *) 195 | (* These settings will cause us to shrink the heap to nearly the actual live data size 196 | when we compact. The smaller the heap the faster fork runs *) 197 | let wakeup_len = 50 in 198 | let wakeup_buf = Bytes.create wakeup_len in 199 | let (wakeup_r, wakeup_w) = U.pipe () in 200 | let select_interrupted = ref false in 201 | let children = Pid.Hash_set.create () in 202 | let clients = U.File_descr.Hash_set.create () in 203 | let ping_master_machine = ping_master_machine () in 204 | let child_machine_stdout = U.File_descr.Hash_set.create () in 205 | let child_machine_stderr = U.File_descr.Hash_set.create () in 206 | List.iter !child_machines ~f:(fun p -> 207 | Hash_set.add child_machine_stdout p.U.Process_info.stdout; 208 | Hash_set.add child_machine_stderr p.U.Process_info.stderr); 209 | let end_the_world exit_status = 210 | if debug then dbp "end the world called"; 211 | Hash_set.iter children ~f:(fun pid -> ignore (Signal.send_i Signal.term (`Pid pid))); 212 | Time.pause (sec 1.0); 213 | Hash_set.iter children ~f:(fun pid -> ignore (Signal.send_i Signal.kill (`Pid pid))); 214 | begin match !master_name with 215 | | Some _ -> () (* We're not the master *) 216 | | None -> (* We should tell all the other machines to shutdown *) 217 | if debug then dbp "we're the master, so sending shutdown to everyone"; 218 | Hashtbl.iteri (Lazy.force machines) ~f:(fun ~key:_ ~data:(ip, port) -> 219 | try 220 | Result.ok_exn (talk_machine ip port (fun s -> write s Request.Shutdown; Ok ())) 221 | with _ -> ()) 222 | end; 223 | Pervasives.exit exit_status 224 | in 225 | Signals.handle 226 | ~onexit:(fun _ -> 227 | if debug then dbp "received exit signal"; 228 | end_the_world 1) 229 | ~onchld:(fun _ -> 230 | if not !select_interrupted then begin 231 | select_interrupted := true; 232 | ignore (syscall (fun () -> U.write wakeup_w ~buf:wakeup_buf ~pos:0 ~len:1)) 233 | end); 234 | try 235 | while true do 236 | let { U.Select_fds.read = read_fds; write=_; except=_ } = 237 | try 238 | let fds = 239 | wakeup_r :: listener 240 | :: Hash_set.to_list child_machine_stdout 241 | @ Hash_set.to_list 
child_machine_stderr 242 | @ Hash_set.to_list clients 243 | in 244 | syscall (fun () -> 245 | U.select ~read:fds ~write:[] ~except:[] 246 | ~timeout:(`After Time_ns.Span.second) ()) 247 | with 248 | | U.Unix_error (EBADF, _, _) -> 249 | Hash_set.filter_inplace clients ~f:(fun fd -> 250 | try ignore (syscall (fun () -> U.fstat fd)); true 251 | with _ -> 252 | (try syscall (fun () -> U.close fd) with _ -> ()); 253 | false); 254 | U.Select_fds.empty 255 | in 256 | (* if we become a child of init then we should die *) 257 | if U.getppid () = Some Pid.init && !master_name = None then end_the_world 0; 258 | let rec reap () = 259 | match (try syscall (fun () -> U.wait_nohang `Any) with _ -> None) with 260 | | None -> () 261 | | Some (pid, e) -> 262 | if debug then dbp (sprintf "pid %d died with %s\n%!" 263 | (Pid.to_int pid) (Sexp.to_string (U.Exit_or_signal.sexp_of_t e))); 264 | Hash_set.remove children pid; 265 | reap () 266 | in 267 | if !select_interrupted then reap (); 268 | (* If we can no longer contact the master machine, then we should 269 | shutdown. *) 270 | begin match ping_master_machine () with 271 | | Ok () -> () (* We successfully pinged the master server *) 272 | | Error _ -> 273 | if debug then dbp "failed to ping master machine, ending the world"; 274 | end_the_world 0 275 | end; 276 | List.iter read_fds ~f:(fun fd -> 277 | if fd = wakeup_r then begin 278 | ignore (syscall (fun () -> 279 | U.read wakeup_r ~buf:wakeup_buf ~pos:0 ~len:wakeup_len)); 280 | select_interrupted := false; 281 | end else if fd = listener then begin 282 | let rec accept () = 283 | let res = 284 | try Some (U.accept listener) with 285 | | U.Unix_error ((EWOULDBLOCK | EAGAIN | ECONNABORTED | EINTR), _, _) 286 | -> None 287 | in 288 | match res with 289 | | None -> () 290 | | Some (fd, _addr) -> 291 | Hash_set.add clients fd; 292 | accept () 293 | in accept (); 294 | end else if Hash_set.mem child_machine_stdout fd then begin 295 | transfer_to (fun () -> 296 | Hash_set.remove child_machine_stdout fd; 297 | try U.close fd with _ -> ()) fd U.stdout 298 | end else if Hash_set.mem child_machine_stderr fd then begin 299 | transfer_to (fun () -> 300 | Hash_set.remove child_machine_stderr fd; 301 | try U.close fd with _ -> ()) fd U.stderr 302 | end else begin 303 | let v = try `Ok (read fd : Request.t) with _ -> `Read_error in 304 | match v with 305 | | `Read_error -> 306 | Hash_set.remove clients fd; 307 | syscall (fun () -> U.close fd) 308 | | `Ok Request.Shutdown -> 309 | if debug then dbp "master process shutdown requested"; 310 | end_the_world 0; 311 | | `Ok Request.Create_worker_process -> 312 | begin match Or_error.try_with create_worker_process with 313 | | Error _ as r -> write fd (Update.Create_worker_process_response r) 314 | | Ok (pid, addr) -> 315 | Hash_set.add children pid; 316 | write fd (Update.Create_worker_process_response (Ok addr)) 317 | end 318 | | `Ok Request.Heartbeat -> write fd Update.Heartbeat 319 | end) 320 | done; 321 | end_the_world 2 322 | with exn -> 323 | if debug then 324 | Debug.log_string 325 | (sprintf "master died with unhandled exception %s" (Exn.to_string exn)); 326 | if debug then 327 | dbp (sprintf "ending the world on unhandled exception %s" (Exn.to_string exn)); 328 | end_the_world 3 329 | ;; 330 | 331 | exception Error_initializing_worker_machine of string * exn [@@deriving sexp] 332 | 333 | module Worker_machines = struct 334 | module To_worker_machine = struct 335 | type t = { 336 | master_name: string; 337 | machines: (U.Inet_addr.Blocking_sexp.t * int) 
String.Table.t; 338 | } [@@deriving sexp] 339 | end 340 | 341 | module From_worker_machine = struct 342 | type t = (U.Inet_addr.Blocking_sexp.t * int) [@@deriving sexp] 343 | end 344 | 345 | (* We're a worker machine, not the master. *) 346 | let as_worker_machine worker_name = 347 | local_name := Some worker_name; 348 | (* Take out the trash *) 349 | let our_binary = U.readlink (sprintf "/proc/%d/exe" (Pid.to_int (U.getpid ()))) in 350 | U.unlink our_binary; 351 | U.rmdir (Filename.dirname our_binary); 352 | let master_cwd = Option.value_exn (Sys.getenv "ASYNC_PARALLEL_MASTER_CWD") in 353 | if Sys.file_exists ~follow_symlinks:true master_cwd = `Yes 354 | && Sys.is_directory ~follow_symlinks:true master_cwd = `Yes 355 | && U.access master_cwd [`Exec] = Ok () 356 | then U.chdir master_cwd 357 | else U.chdir "/"; 358 | let (listening_socket, addr) = listener () in 359 | (* Tell the master machine our address. *) 360 | printf "%s\n%!" (Sexp.to_string_mach (From_worker_machine.sexp_of_t addr)); 361 | (* Accept one connection so we can read everyone else's address from the master 362 | machine *) 363 | let rec accept () = 364 | let res = 365 | try Some (U.accept listening_socket) with 366 | | U.Unix_error ((EWOULDBLOCK | EAGAIN | ECONNABORTED | EINTR), _, _) -> 367 | None 368 | in 369 | match res with 370 | | None -> Time.pause (sec 0.1); accept () 371 | | Some (fd, _) -> fd 372 | in 373 | let s = accept () in 374 | let ic = U.in_channel_of_descr s in 375 | let l = Option.value_exn (In_channel.input_line ic) in 376 | let m = To_worker_machine.t_of_sexp (Sexp.of_string l) in 377 | In_channel.close ic; 378 | let tbl = Lazy.force machines in 379 | Hashtbl.clear tbl; 380 | Hashtbl.iteri m.To_worker_machine.machines ~f:(fun ~key ~data -> 381 | Hashtbl.set tbl ~key ~data); 382 | master_name := Some (m.To_worker_machine.master_name); 383 | never_returns (run listening_socket) 384 | ;; 385 | 386 | (* All hail the great designers of the unix shell and operating system. *) 387 | let cmd bin_name cwd local_name = 388 | let async_config_set_var = 389 | let var = Async.Async_config.environment_variable in 390 | match Sys.getenv var with 391 | | None -> "" 392 | | Some value -> sprintf "%s=%S" var value 393 | in 394 | sprintf "( 395 | US=$(mktemp -d) 396 | EXE=${US}/%s 397 | # the master machine is going to send then binary across stdin, and 398 | # then close it, which will cause cat to exit, and the binary to run. 399 | # we don't need stdin anyway, so it's fine if it gets closed. 400 | cat >\"$EXE\" 401 | chmod 700 \"$EXE\" 402 | %s \\ 403 | ASYNC_PARALLEL_MASTER_CWD=\"%s\" \\ 404 | ASYNC_PARALLEL_IS_CHILD_MACHINE=\"%s\" \\ 405 | \"$EXE\" 422 | let module P = U.Process_info in 423 | let p = 424 | U.create_process ~prog:"ssh" 425 | ~args:(ssh_options @ [machine; cmd (Filename.basename our_binary) our_cwd machine]) 426 | in 427 | try 428 | really_io (U.single_write_substring ~restart:false) p.P.stdin us 429 | ~len:(String.length us) ~pos:0; 430 | U.close p.P.stdin; 431 | let ic = U.in_channel_of_descr p.P.stdout in 432 | let (addr, port) = 433 | (* The program might say other stuff during startup on stdout, just skip until 434 | we find our sexp. 
*) 435 | let rec loop () = 436 | let line = Option.value_exn (In_channel.input_line ic) in 437 | try From_worker_machine.t_of_sexp (Sexp.of_string line) 438 | with _ -> loop () 439 | in loop () 440 | in 441 | child_machines := p :: !child_machines; 442 | Hashtbl.set tbl ~key:machine ~data:(addr, port); 443 | (addr, port) 444 | with e -> 445 | (try ignore (Signal.send Signal.kill (`Pid p.P.pid) : [`No_such_process | `Ok]) 446 | with _ -> ()); 447 | (try U.close p.P.stdout with _ -> ()); 448 | (try U.close p.P.stderr with _ -> ()); 449 | (try ignore (U.wait_nohang (`Pid p.P.pid) : (Pid.t * U.Exit_or_signal.t) option) 450 | with _ -> ()); 451 | raise (Error_initializing_worker_machine (machine, e))) 452 | in 453 | let m = 454 | { To_worker_machine. 455 | machines = tbl; 456 | master_name = cluster.Cluster.master_machine } 457 | in 458 | let sexp = Sexp.to_string_mach (To_worker_machine.sexp_of_t m) in 459 | List.iter addrs ~f:(fun (ip, port) -> 460 | Result.ok_exn 461 | (talk_machine ip port (fun s -> 462 | let oc = U.out_channel_of_descr s in 463 | Out_channel.output_lines oc [sexp]; 464 | Out_channel.flush oc; 465 | Ok ()))) 466 | end 467 | 468 | let init ?cluster ?(close_stdout_and_stderr = false) () = 469 | let worker_name = Sys.getenv "ASYNC_PARALLEL_IS_CHILD_MACHINE" in 470 | if worker_name <> None then 471 | Worker_machines.as_worker_machine (Option.value_exn worker_name) 472 | else if Hashtbl.length (Lazy.force machines) > 0 then 473 | failwith "Master process already initialized" 474 | else begin 475 | let (listening_socket, addr) = listener () in 476 | begin match cluster with 477 | | None -> 478 | local_name := Some "local"; 479 | Hashtbl.set (Lazy.force machines) ~key:(Option.value_exn !local_name) 480 | ~data:addr; 481 | | Some c -> 482 | local_name := Some c.Cluster.master_machine; 483 | Hashtbl.set (Lazy.force machines) ~key:(Option.value_exn !local_name) 484 | ~data:addr; 485 | try Worker_machines.init c 486 | with e -> 487 | (* Set up our state so it can be initialized again *) 488 | (try U.close listening_socket with _ -> ()); 489 | local_name := None; 490 | Hashtbl.clear (Lazy.force machines); 491 | raise e 492 | end; 493 | match U.fork () with 494 | | `In_the_child -> 495 | (* The master process *) 496 | begin 497 | if close_stdout_and_stderr then 498 | (Unix.(close stdout); Unix.(close stderr)) 499 | end; 500 | never_returns (run listening_socket); 501 | | `In_the_parent master_pid -> 502 | (* The main process *) 503 | if debug then 504 | dbp (sprintf "created master process with pid %s" (Pid.to_string master_pid)); 505 | syscall (fun () -> U.close listening_socket); 506 | master_pid; 507 | end 508 | ;; 509 | 510 | (* Code below here does not run in the master process, and so can use async. 
*) 511 | open Async 512 | 513 | type host = { 514 | q: (Update.t, string) Result.t Ivar.t Queue.t; 515 | mutable listening: bool; 516 | mutable conn: [ `Connecting of unit Deferred.t 517 | | `Connected of (Reader.t * Writer.t) 518 | | `Not_connected ]; 519 | } 520 | 521 | let create () = {q = Queue.create (); listening = false; conn = `Not_connected} 522 | 523 | let hosts : host String.Table.t = String.Table.create () 524 | 525 | let kill ?(e = Error "killed") h = 526 | begin match h.conn with 527 | | `Not_connected -> assert false 528 | | `Connecting _ -> assert false 529 | | `Connected (r, w) -> 530 | ignore (Monitor.try_with (fun () -> Writer.close w)); 531 | ignore (Monitor.try_with (fun () -> Reader.close r)) 532 | end; 533 | h.listening <- false; 534 | h.conn <- `Not_connected; 535 | Queue.iter h.q ~f:(fun i -> Ivar.fill i e); 536 | Queue.clear h.q 537 | ;; 538 | 539 | let listen h = 540 | if h.listening then () 541 | else begin 542 | let rec loop () = 543 | match h.conn with 544 | | `Connecting _ -> assert false 545 | | `Not_connected -> assert false 546 | | `Connected (r, _) -> 547 | Monitor.try_with (fun () -> 548 | (Reader.read_marshal r : Update.t Reader.Read_result.t Deferred.t)) 549 | >>> function 550 | | Error e -> kill ~e:(Error (Exn.to_string e)) h 551 | | Ok `Eof -> kill ~e:(Error "Eof") h 552 | | Ok (`Ok a) -> 553 | Ivar.fill (Queue.dequeue_exn h.q) (Ok a); 554 | (* we don't keep a persistent connection to the master process, 555 | since many processes on many machines may be talking to it, 556 | and we don't want to run it out of file descriptors. *) 557 | if Queue.is_empty h.q then kill h else loop () 558 | in 559 | h.listening <- true; 560 | loop () 561 | end 562 | ;; 563 | 564 | let rec connected_host host f = 565 | let machines = Lazy.force machines in 566 | let (addr, port) = Hashtbl.find_exn machines host in 567 | let connect h = 568 | let s = Socket.create Socket.Type.tcp in 569 | let rw = 570 | socket_connect_inet s (addr, port) >>| fun s -> 571 | let w = Writer.create (Socket.fd s) in 572 | let r = Reader.create (Socket.fd s) in 573 | (r, w) 574 | in 575 | h.conn <- `Connecting (rw >>| fun (_r, _w) -> ()); 576 | rw >>| fun (r, w) -> 577 | h.conn <- `Connected (r, w); 578 | (h, w) 579 | in 580 | match Hashtbl.find_or_add hosts host ~default:create with 581 | | {conn = `Connecting rw; _} -> (rw >>= fun () -> connected_host host f) 582 | | {conn = `Not_connected; _} as h -> (connect h >>= fun _ -> connected_host host f) 583 | | {conn = `Connected (_, w); _} as h -> f (h, w) 584 | ;; 585 | 586 | let choose_machine where = 587 | let tbl = Lazy.force machines in 588 | if Hashtbl.length tbl = 0 then failwith "Parallel.init not called"; 589 | match where with 590 | | `Local -> Option.value_exn !local_name 591 | | `On host -> 592 | if Hashtbl.mem tbl host then host 593 | else failwithf "unknown host %s" host () 594 | | `F f -> 595 | let host = f () in 596 | if Hashtbl.mem tbl host then host 597 | else failwithf "unknown host %s" host () 598 | ;; 599 | 600 | let create_process ?(where = `Local) () = 601 | connected_host (choose_machine where) (fun (h, w) -> 602 | Deferred.create (fun i -> 603 | Queue.enqueue h.q i; 604 | write_marshal w ~flags:[] Request.Create_worker_process; 605 | listen h)) 606 | >>| function 607 | | Error e -> Error (Error.of_string e) 608 | | Ok (Update.Create_worker_process_response addr) -> addr 609 | | Ok Update.Heartbeat -> assert false 610 | ;; 611 | 612 | let shutdown () = 613 | if debug then dbp "Master_process.shutdown called"; 614 | 
connected_host (Option.value_exn !local_name) (fun (h, w) -> 615 | write_marshal w ~flags:[] Request.Shutdown; 616 | Writer.flushed w 617 | >>| fun () -> 618 | kill h; 619 | Hashtbl.clear (Lazy.force machines); 620 | child_machines := []; 621 | local_name := None; 622 | master_name := None) 623 | ;; 624 | 625 | 626 | (* Process distribution methods *) 627 | 628 | let round_robin = 629 | let i = ref 0 in 630 | `F (fun () -> 631 | let tbl = Lazy.force machines in 632 | let a = Array.of_list (Hashtbl.keys tbl) in 633 | let current = !i in 634 | incr i; 635 | if !i >= Array.length a then i := 0; 636 | a.(current)) 637 | 638 | let random = 639 | `F (fun () -> 640 | let tbl = Lazy.force machines in 641 | let st = Lazy.force st in 642 | let a = Array.of_list (Hashtbl.keys tbl) in 643 | a.(Random.State.int st (Array.length a))) 644 | 645 | let random_in set = 646 | List.iter set ~f:(fun s -> 647 | if not (Hashtbl.mem (Lazy.force machines) s) then 648 | failwithf "unknown host in host set %s" s ()); 649 | `F (fun () -> 650 | let st = Lazy.force st in 651 | let a = Array.of_list set in 652 | a.(Random.State.int st (Array.length a))) 653 | -------------------------------------------------------------------------------- /src/master_process.mli: -------------------------------------------------------------------------------- 1 | (* The master process is a child of the main process that is using the parallel library. 2 | It is responsible for the worker processes, which are in turn its children. *) 3 | 4 | open! Core 5 | open! Async 6 | open! Import 7 | 8 | (* Init will raise this if any worker machine in the cluster fails initialization. It is 9 | safe to catch this exception and call init again. *) 10 | exception Error_initializing_worker_machine of string * exn [@@deriving sexp] 11 | 12 | (** [init] initializes the system and creates the master process. [master_init], if 13 | specified, is called in the master process and may be used for cleanup/initialization 14 | such as closing file descriptors. [init] should be called before any threads are 15 | created. If your program daemonizes, call [init] after you daemonize, but before you 16 | start the async scheduler. [init] may only be called once. 17 | 18 | If [cluster] is specified, and it specifies a set of host names or ips that you can 19 | ssh to without a password, and have permission to run programs on, then a master 20 | process will also be started on these machines, and you will be able to spawn worker 21 | processes on these machines. The current machine is implicitly included in the set of 22 | available machines. 23 | *) 24 | val init : 25 | ?cluster:Cluster.t -> 26 | ?close_stdout_and_stderr:bool -> 27 | unit -> 28 | Pid.t (* of the master process *) 29 | 30 | (** All the functions below are called in either the main process or a worker process. *) 31 | 32 | 33 | (** Request a new process. Returns the ip/port where the new process may be reached. If 34 | specified [where] determines which machine the process will be spawned on, the default 35 | is `Local (the current machine). *) 36 | val create_process : 37 | ?where:[`Local | `On of string | `F of (unit -> string)] 38 | -> unit 39 | -> (Unix.Inet_addr.t * int) Or_error.t Deferred.t 40 | 41 | (** Tell the master process to shutdown. 
*) 42 | val shutdown : unit -> unit Deferred.t 43 | 44 | (* Process distribution methods *) 45 | val round_robin : [> `F of (unit -> string)] 46 | val random : [> `F of (unit -> string)] 47 | val random_in : string list -> [> `F of (unit -> string)] 48 | -------------------------------------------------------------------------------- /src/std.ml: -------------------------------------------------------------------------------- 1 | (** DEPRECATION WARNING: Async_parallel has been deprecated in favor of Rpc_parallel. *) 2 | 3 | 4 | (** [Parallel] is a library for running tasks in other processes on a cluster of machines. 5 | At its simplest, it exposes the [Parallel.run] function: 6 | 7 | | val run : ?where:[`Local | `On of string | `Random | `Random_on of string list] 8 | -> (unit -> 'a Deferred.t) -> ('a, string) Result.t Deferred.t 9 | 10 | where [run f] creates another process on the machine specified by [where] whose sole 11 | job is to compute [f ()]. The process that calls [run] will receive the result of [f 12 | ()] when it finishes. Note that [f] itself could call [run], thus allowing arbitrarily 13 | nested processes across arbitrary machines in the cluster. 14 | 15 | In order to use [Parallel.run], for technical reasons, one must first call 16 | [Parallel.init], which must be called before any threads are created and before 17 | Async's scheduler is started. 18 | 19 | Parallel's "hubs" and "channels" support typed bidirectional communication of streams 20 | of data between processes. One process creates a hub and listens to it. Any other 21 | process can then open a channel to the hub, and write values on the channel that 22 | will be received by the process listening to the hub. Similarly, the hub process can 23 | send values via a channel to the process that opened the channel. 24 | 25 | Moreover, channels may be passed between processes, either implicitly by being 26 | captured in a closure, or explicitly over another channel. 27 | 28 | Implementation overview 29 | ======================= 30 | There are three kinds of processes involved in a program that uses Parallel: 31 | 32 | - the main process 33 | - the master process 34 | - worker processes 35 | 36 | Parallel dynamically creates a worker process to service each call to [run]. 37 | 38 | The OS process tree looks like: 39 | 40 | | main 41 | | master 42 | | worker1 43 | | ... 44 | | workerN 45 | 46 | As far as the OS is concerned, all workers are children of the master. However, from 47 | the perspective of Parallel, the topology is more structured. Each worker process is 48 | created on behalf of its "owner" process, which is either the main process or another 49 | worker process. One can think of the main and worker processes as arranged in a tree 50 | different from the OS tree, in which there is an edge from each process to its owner 51 | (the main process has no owner). 52 | 53 | Parallel uses OCaml's [Marshal] library to serialize OCaml values to and from strings 54 | so that they can be sent over unix sockets between processes. For example, the [f] 55 | supplied to [run] is marshaled and sent from the process that calls [run] to the 56 | worker process that will run [f]. Most, but not all, values can be marshaled. Examples 57 | of values that can't be marshaled include C-allocated abstract tagged values and custom 58 | blocks with no serialize/deserialize method. 59 | 60 | The main process and all worker processes have a socket connected to the master 61 | process.
The master process's sole job is to service requests that are sent to 62 | these sockets, which can ask it to: 63 | 64 | - create a new "worker process", via [create_process] 65 | 66 | As the master process receives requests, it does what each request asks, and then 67 | sends a response back via the socket to the client that made the request. 68 | 69 | Each worker process has a socket connected to its owner process. This socket 70 | initially receives the [f] that the worker is to run, and is ultimately used to send 71 | the result back from the worker to the owner. 72 | 73 | Here are the steps involved in implementing [run f]. There are three processes 74 | involved. 75 | 76 | - R = the process calling [run] 77 | - M = the master process 78 | - W = the worker process running the task 79 | 80 | The steps are: 81 | 82 | 1. R asks M to create W 83 | 2. M forks W 84 | 3. M tells R about W 85 | 4. R sends [f] to W to run 86 | 5. W runs [f] 87 | 6. W sends the result of [f] to R 88 | 7. M notices W has exited, and cleans up 89 | 90 | When there are multiple machines in a cluster, each machine has a master process, 91 | and all the workers know about all master processes. When a worker wants to run on 92 | machine M, it looks up the address of that machine's master process in its table 93 | before performing step 1; everything after that is exactly the same as in the single-machine case. 94 | 95 | Notes: 96 | 97 | Channel Passing 98 | --------------- 99 | 100 | When a channel is passed from one process to another, the open socket is not actually 101 | passed. The API makes this pretty transparent: any API call will reconnect the 102 | channel, but it is useful to be aware of what is really going on, because if you aren't 103 | aware you may create a race condition. For example, if I spawn a worker connected to a 104 | hub I have, and then I immediately send something, it may or may not arrive, because 105 | the worker may not have time to connect and receive it. A better strategy is to wait 106 | for the worker to say hello, and then send the data. This also means that you might 107 | have created only one channel from a given hub, but you can end up with as many 108 | connections (client ids) as workers who got hold of that channel. You can address them 109 | all individually, or you can always use [send_to_all] if you really want to model a 110 | hub as a kind of shared bus. 111 | 112 | Stdout and Stderr 113 | ----------------- 114 | 115 | Care has been taken to make printf-style debugging work transparently with Parallel: 116 | even when run on a multiple-machine cluster, stdout and stderr will be forwarded back 117 | to the master machine. This can cause some interleaving if you print a lot of 118 | messages, but generally works reasonably well (and we read and write in big chunks, so 119 | most of the interleaving won't be interline). 120 | 121 | Some things to avoid marshaling 122 | ------------------------------- 123 | 124 | Monitor.t, Pcre.regexp, Writer.t, Reader.t, and similar kinds of objects shouldn't be 125 | depended upon to marshal correctly. Pcre.regexp is right out; it definitely won't 126 | work. Monitor.t, Writer.t, and Reader.t, because of their complex nature, generally 127 | tow the entire async scheduler along with them, and because of that they will fail if 128 | any job on the scheduler queue has a custom object (e.g. a regexp, or another C object) 129 | that can't be marshaled. You also can't marshal functions you've dynamically loaded 130 | (e.g. with ocaml plugin).
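As a concrete illustration of the [run] pattern described above, here is a minimal sketch; it is not taken from this library's example directory, it assumes [open Core], [open Async] and this library's [Std], and the name [sum_remotely] is invented. Only plain data is captured in the closure and returned, in line with the marshaling caveats above.

    | let sum_remotely xs =
    |   (* [xs : int list] is plain data, so the closure marshals cleanly to the worker *)
    |   Parallel.run (fun () -> return (List.fold xs ~init:0 ~f:(+)))
    |   >>| function
    |   | Error e -> failwithf "worker failed: %s" e ()
    |   | Ok sum -> sum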
131 | 132 | Processes don't share memory 133 | ---------------------------- 134 | 135 | The library can make it look very transparent to create and use other processes, but 136 | please remember these can literally be on some other machine, maybe halfway round the 137 | earth. Global variables you set in one worker process have no effect whatsoever on 138 | other worker processes. I've personally come to believe that this is good: it results 139 | in better-designed, more scalable systems. 140 | 141 | Big shared things 142 | ----------------- 143 | 144 | Because of the way Parallel works, with the master process being an image of a very early 145 | state of one's program and workers forked from the master, it is usually not possible 146 | to share big static things in the way one might do in C using fork. Moreover, it isn't 147 | necessarily the win you might think: if you know that unix only copies pages 148 | on write when a process forks, you might expect sharing to come for free. But the garbage 149 | collector ruins that completely, because as it scans it will write to EVERY page, 150 | causing a copy-on-write fault to copy the page, so you'll end up with a non-shared 151 | copy of that big static thing in every process anyway. The best you can probably do is 152 | have one process own it and expose it with a query interface. Moreover, if you're 153 | running on multiple machines that IS the best you can do, so you may as well get used to 154 | it. 155 | 156 | Why Not Just Fork!? 157 | ------------------- 158 | 159 | The unix-savvy among you may ask: what the heck are you doing with master processes 160 | and closure passing, just fork! Oh how that would make life easier, but alas, it 161 | really isn't possible. Why? You can't write async without threads, because the Unix 162 | API doesn't provide an asynchronous system call for every operation, meaning that if you 163 | need to do something that might block, you must do it in a thread. And the list of 164 | stuff that might block is long and crippling. Want to read from a file without 165 | blocking for SECONDS? Sorry! Not without a thread you don't. But once you've 166 | started a thread, all bets are off if you fork. POSIX actually doesn't even say 167 | anything about what happens to threads in a process that forks (besides saying they 168 | don't think it's a good idea). In every sane OS, only the forking thread continues in 169 | the child; all the other threads are dead. OK, fine, you say, let them die. But their 170 | mutexes, semaphores and condition variables are in whatever state they were in at the 171 | moment you forked, that is to say, any state at all. Unfortunately this means that 172 | once you have created a thread that does anything meaningful (e.g. calls into libc), if you 173 | fork, all bets are off as to what happens in that child process. A dead thread may, 174 | for example, hold the lock around the C heap, which would mean that any call into libc 175 | would deadlock trying to allocate memory (oops); that'd ruin your day. Trust me, if 176 | Parallel could work in some simpler way, it would! 177 | 178 | Say I Want To Have a Giant Shared Matrix 179 | ---------------------------------------- 180 | 181 | The parallelism model implemented is strictly message passing; shared memory isn't 182 | implemented, but there are various hacks you could use to make this work 183 | (e.g. implement it yourself). Bigarray already allows mmapping files, so in theory 184 | even a cluster of machines could all mmap a giant file and use/mutate it.
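To make the "one process owns it and exposes a query interface" suggestion above concrete, here is a rough sketch built only from the [Parallel.spawn], [Hub] and [Channel] operations exercised by the tests in this repository; [build_table], [`Lookup] and [`Value] are invented for illustration. The big structure is built inside the spawned worker rather than captured in the closure, so nothing large is marshaled.

    | let start_table_server build_table =
    |   Parallel.spawn (fun hub ->
    |     let table = build_table () in  (* built once, owned by this worker *)
    |     Pipe.iter_without_pushback (Hub.listen hub) ~f:(function
    |       | `Data (client, `Lookup key) ->
    |         Hub.send hub client (`Value (Hashtbl.find table key))
    |       | `Connect _ | `Disconnect _ -> ()))
    |   >>| fun (channel, _result) -> channel
    |
    | let lookup channel key =
    |   Channel.write channel (`Lookup key);
    |   Channel.read channel >>| fun (`Value v) -> v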
185 | 186 | Making Your Program Work on Multiple Machines 187 | --------------------------------------------- 188 | 189 | The library makes this pretty transparent; however, there are a couple of things to 190 | watch out for if you want it to work seamlessly. First of all, your program should be 191 | able to run with no arguments. Ideally you'd call Parallel.init before parsing 192 | arguments, or at least you'd check to see if you're a worker machine before parsing 193 | arguments. The reason for this is that the library is going to copy your program to 194 | every machine in the cluster and start it up, setting an environment 195 | variable as it does so. Parallel.init checks that environment variable and does 196 | something completely different if it's set; in fact, Parallel.init will never return in 197 | this scenario, but will instead become the master process for that machine. If you 198 | want to change your behavior based on whether you're running a worker machine or the 199 | master, you can use Parallel.is_worker_machine. In general, put Parallel.init as early 200 | in your program as possible. 201 | 202 | Try to avoid printing crazy things like sexps, or tons of data, to stdout before 203 | calling Parallel.init. It uses stdout to communicate its address back to the master 204 | machine. The parser is pretty robust, and will toss out most things you print, but if 205 | you happen to print just the right sexp, it might think you're at the wrong 206 | address. This would just cause startup to hang, but would probably be hard to debug. 207 | 208 | The examples (in the example directory) all work on multiple machines, if you're 209 | stumped for a template to follow. 210 | 211 | Why Can't I Use Async Before Parallel.init? 212 | ------------------------------------------- 213 | 214 | By default Parallel.init does a check that you haven't created any threads, and that 215 | you haven't made any use of async. The threads check is mandatory, but the async check 216 | can be turned off by setting [fail_if_async_has_been_initialized] to false. Why is 217 | this check the default? Well, in general you can't initialize async libraries before 218 | calling Parallel.init and expect them to work in the child process. The reason is that 219 | the async scheduler is thrown away in the child process before calling 220 | Scheduler.go. This is out of necessity: there is no way we can know what state the 221 | scheduler is in at the moment we fork, and it would be quite unfortunate if it were in 222 | a bad state or, worse, if there were jobs on the queue that would get run in all workers as soon 223 | as they call Scheduler.go. But as a result of this, any asyncy thing you create before 224 | Parallel.init won't work in worker processes. For example, say you initialize the log 225 | module before Parallel.init expecting to use it in the workers. It won't work, since 226 | all of its state (loops, writers, etc.) is invalid in the worker processes. The check 227 | exists to make sure people are aware of this, and to make sure it only happens if they 228 | really know it's ok. 229 | 230 | What CWD Will Worker Machine Processes Have? 231 | -------------------------------------------- 232 | 233 | If the CWD of the master exists on the worker machine, and you have permission to 234 | enter it, then Parallel will switch to that directory before starting the master 235 | process; otherwise it will chdir to /.
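Putting the multiple-machine advice together, a minimal skeleton, mirroring test/test_remote.ml later in this repository, looks roughly like this; [do_my_parallel_work] and the host names in [worker_machines] are placeholders (the hosts must be reachable over passwordless ssh).

    | let main () =
    |   Parallel.init ~cluster:{ Cluster.master_machine = Unix.gethostname ();
    |                            worker_machines = ["host1"; "host2"] } ();
    |   do_my_parallel_work () >>> fun () ->
    |   Parallel.shutdown () >>> fun _ -> Shutdown.shutdown 0
    |
    | let () = main (); never_returns (Scheduler.go ())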
236 | *) 237 | 238 | module Parallel = Intf 239 | module Channel = Channel 240 | module Hub = Hub 241 | module Cluster = Import.Cluster 242 | 243 | (* module Shm : sig 244 | * type 'a descriptor 245 | * val copy_to : 'a -> 'a descriptor Async.Deferred.t 246 | * val reify : 'a descriptor -> 'a Async.Deferred.t 247 | * end = Shm *) 248 | 249 | (* This toplevel module expression only runs in processes spawned by 250 | init_child_machines. *) 251 | -------------------------------------------------------------------------------- /src/token.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | (** Token is a mechanism to detect when an object is not in the process where it was 4 | created. Every process has a token, which occupies some area of physical 5 | memory. Every object we wish to track has a pointer to it's processes token. Checking 6 | whether the object is being used in another process than the one that created it is 7 | then a simple matter of checking physical equality of the executing processes' token 8 | against the token stored in the object. If they don't match, then the object has been 9 | moved to another process. This method avoids the pitfalls of using pids, which are 10 | reused on a short time scale by the OS. *) 11 | 12 | type t = {v: unit} 13 | 14 | (** It's true that because we allocate this block at module init time every worker process 15 | will have [mine] at the same address. However this is ok, because when we marshal 16 | something it will make a deep copy of mine, and so we can still detect that we're not 17 | in the same process as where we were created. *) 18 | let mine = {v = ()} 19 | let valid tok = phys_equal tok mine 20 | 21 | -------------------------------------------------------------------------------- /src/token.mli: -------------------------------------------------------------------------------- 1 | 2 | type t 3 | 4 | val mine: t 5 | val valid: t -> bool 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/worker_process.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Import 4 | 5 | module To_worker = struct 6 | type ('a, 'b, 'c) t = 7 | | Run of Writer.buffer_age_limit option * (('a, 'b) Hub.t -> 'c Deferred.t) 8 | end 9 | 10 | module From_worker = struct 11 | type 'a t = 12 | | Result of 'a 13 | | Exn of string 14 | end 15 | 16 | let go (type a) (type b) (type c) ~control_socket = 17 | if debug then dbp "worker process starting"; 18 | let monitor = Monitor.create () in 19 | Stream.iter (Monitor.detach_and_get_error_stream monitor) ~f:(fun e -> 20 | if debug then 21 | dbp (sprintf "Worker_process: unhandled exception %s" (Exn.to_string e)); 22 | Shutdown.shutdown 1); 23 | within ~monitor (fun () -> 24 | if debug then dbp "Worker_process.run"; 25 | Socket.accept control_socket >>> function 26 | | `Socket_closed -> () 27 | | `Ok (client, _) -> 28 | let fd = Socket.fd client in 29 | let owner_reader = Reader.create fd in 30 | let owner_writer = Writer.create fd in 31 | Reader.read_marshal_raw owner_reader 32 | >>> function 33 | | `Eof -> 34 | if debug then dbp "Worker_process: Eof reading job"; 35 | Shutdown.shutdown 2 36 | | `Ok x -> 37 | if debug then dbp "worker process read request"; 38 | match (Marshal.from_bytes x 0 : (a, b, c) To_worker.t) with 39 | | To_worker.Run (buffer_age_limit, f) -> 40 | if debug then dbp "got run request, creating hub"; 41 | Hub.create ?buffer_age_limit 
control_socket 42 | >>> fun hub -> 43 | if debug then dbp "running f"; 44 | (Monitor.try_with (fun () -> f hub) 45 | >>| (function 46 | | Error e -> 47 | if debug then dbp "result is exn"; 48 | From_worker.Exn (Exn.to_string e) 49 | | Ok r -> 50 | if debug then dbp "result is success"; 51 | From_worker.Result r)) 52 | >>> fun res -> 53 | if debug then dbp "writing result"; 54 | write_marshal ~flags:[Marshal.Closures] owner_writer 55 | (res : c From_worker.t); 56 | Writer.flushed owner_writer 57 | >>> fun _ -> 58 | if debug then dbp "wrote result"; 59 | shutdown 0) 60 | 61 | let run ~control_socket = 62 | go ~control_socket; 63 | Scheduler.go (); 64 | ;; 65 | -------------------------------------------------------------------------------- /src/worker_process.mli: -------------------------------------------------------------------------------- 1 | (** This interface is not intended to be used directly by users! 2 | 3 | The master process forks a worker process for each task that Parallel is asked to do. 4 | After forking, in the worker process, the master calls [Worker_process.run], which: 5 | 6 | 1. accepts a single connection from its [control_socket]. 7 | 2. reads a single marshaled [To_worker.Run f] message over the connection. 8 | 3. creates a hub with [control_socket] and supplies it to [f]. 9 | 4. when [f] returns, writes a single [From_worker] message with the result over 10 | the [control_socket] connection. 11 | 5. exits. 12 | *) 13 | 14 | open! Core 15 | open! Async 16 | 17 | module To_worker : sig 18 | type ('a, 'b, 'c) t = 19 | | Run of Writer.buffer_age_limit option * (('a, 'b) Hub.t -> 'c Deferred.t) 20 | end 21 | 22 | module From_worker : sig 23 | type 'a t = 24 | | Result of 'a 25 | | Exn of string 26 | end 27 | 28 | val run 29 | : control_socket:([`Passive], Socket.Address.Inet.t) Socket.t 30 | -> never_returns 31 | -------------------------------------------------------------------------------- /test/jbuild: -------------------------------------------------------------------------------- 1 | (alias 2 | ((name runtest) 3 | (deps (qtest.exe test_remote.exe)) 4 | (action (bash "./qtest.exe")))) 5 | 6 | (executables 7 | ((names (qtest test_remote)) 8 | (libraries (async 9 | async_parallel_deprecated 10 | core 11 | qtest_lib)) 12 | (preprocess (pps (ppx_jane ppxlib.runner))))) 13 | 14 | 15 | (jbuild_version 1) 16 | -------------------------------------------------------------------------------- /test/parallel_test.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let wide () = 6 | Deferred.List.iter [ 1; 2; 10; 100 ] ~f:(fun num_children -> 7 | Deferred.all 8 | (List.init num_children ~f:(fun i -> 9 | Parallel.run (fun () -> return i) 10 | >>| function 11 | | Error e -> failwith e 12 | | Ok i -> i)) 13 | >>| fun l -> 14 | assert (l = List.init num_children ~f:Fn.id)) 15 | ;; 16 | 17 | let deep () = 18 | Deferred.List.iter [ 1; 2; 10; 100 ] ~f:(fun depth -> 19 | let rec loop i = 20 | if i = 0 21 | then return 0 22 | else begin 23 | Parallel.run (fun () -> loop (i - 1)) 24 | >>| function 25 | | Error e -> failwith e 26 | | Ok j -> j + 1 27 | end 28 | in 29 | loop depth 30 | >>| fun d -> 31 | assert (d = depth)) 32 | ;; 33 | 34 | let fib () = 35 | let rec fib n = 36 | Parallel.run (fun () -> 37 | match n with 38 | | 0 -> return 0 39 | | 1 -> return 1 40 | | n -> 41 | fib (n - 1) 42 | >>= fun f1 -> 43 | fib (n - 2) 44 | >>= fun f2 -> 45 | return (f1 + f2)) 46 | >>| 
function 47 | | Error e -> failwith e 48 | | Ok n -> n 49 | in 50 | fib 10 51 | >>| fun n -> 52 | assert (n = 55) 53 | ;; 54 | 55 | let ring () = 56 | let add c f = 57 | Parallel.spawn (fun h -> 58 | let ready = Ivar.create () in 59 | (Pipe.iter_without_pushback (Hub.listen h) ~f:(function 60 | | `Connect _ -> Ivar.fill_if_empty ready () 61 | | _ -> ()) >>> ignore); 62 | let rec loop () = 63 | Channel.read c >>> fun a -> 64 | f a >>> fun b -> 65 | Ivar.read ready >>> fun () -> 66 | Hub.send_to_all h b; 67 | Hub.flushed h 68 | >>> fun () -> 69 | loop () 70 | in 71 | loop (); 72 | Deferred.never ()) 73 | >>| fun (c, _) -> c 74 | in 75 | let main () = 76 | Parallel.hub () >>= fun hub -> 77 | let ready = Ivar.create () in 78 | (Pipe.iter_without_pushback (Hub.listen hub) ~f:(function 79 | | `Connect _ -> Ivar.fill_if_empty ready () 80 | | _ -> ()) >>> ignore); 81 | Hub.open_channel hub >>= fun c -> 82 | add c (fun x -> return (x + 1)) >>= fun c -> 83 | add c (fun x -> return (x + 2)) >>= fun c -> 84 | add c (fun x -> return (x + 3)) >>= fun c -> 85 | let rec loop i = 86 | if i >= 1000 then Deferred.unit 87 | else begin 88 | let res = Channel.read c in 89 | Ivar.read ready >>= fun () -> 90 | Hub.send_to_all hub i; 91 | Hub.flushed hub 92 | >>= fun () -> 93 | res >>= fun i -> 94 | loop (i + 4) 95 | end 96 | in 97 | loop 0 >>= fun () -> 98 | Hub.close hub 99 | in 100 | main () 101 | ;; 102 | 103 | let echo_server () = 104 | let echo_server s = 105 | Deferred.create (fun i -> 106 | Pipe.iter' (Hub.listen s) ~f:(fun q -> 107 | Deferred.all_unit 108 | (Queue.fold q ~init:[] ~f:(fun acc x -> 109 | match x with 110 | | `Connect _ -> 111 | Deferred.unit :: acc 112 | | `Data (c, `Echo a) -> 113 | Hub.send s c (`Echo a); 114 | Hub.flushed s :: acc 115 | | `Data (_, `Die) -> 116 | Ivar.fill i (); 117 | Deferred.unit :: acc 118 | | `Disconnect _ -> Deferred.unit :: acc))) 119 | >>> fun () -> Ivar.fill_if_empty i ()) 120 | in 121 | let main () = 122 | Parallel.spawn echo_server >>= fun (c, res) -> 123 | Channel.write c (`Echo "foo"); 124 | Channel.read c >>= fun (`Echo z) -> 125 | assert (z = "foo"); 126 | Deferred.create (fun iv -> 127 | let rec loop i = 128 | if i >= 100 then Ivar.fill iv () 129 | else begin 130 | Parallel.run (fun () -> 131 | let id = Int.to_string i in 132 | Channel.write c (`Echo id); 133 | Channel.read c >>= fun (`Echo z) -> 134 | assert (z = id); 135 | let go () = 136 | Parallel.run (fun () -> 137 | Channel.write c (`Echo ("sub" ^ id)); 138 | Channel.read c >>= (fun (`Echo z) -> 139 | assert (z = "sub" ^ id); 140 | Deferred.unit)) 141 | >>| (function 142 | | Error e -> failwithf "worker within worker died %s" e () 143 | | Ok () -> ()) 144 | in 145 | Deferred.all_unit [go (); go (); go (); go ()]) 146 | >>> (function 147 | | Error e -> failwithf "client died with exception \"%s\"" e () 148 | | Ok () -> loop (i + 1)) 149 | end 150 | in 151 | loop 0) 152 | >>= fun () -> 153 | Channel.write c `Die; 154 | res >>| function 155 | | Error e -> failwithf "echo server died with exception %s" e () 156 | | Ok () -> () 157 | in 158 | main () 159 | ;; 160 | 161 | (* We can't test remote machines within the test framework because qtest.exe doesn't 162 | support being called in that way, so there is a separate binary, test_remote.exe, for 163 | testing remote machine functionality. 
*) 164 | let remote () = 165 | Unix.readlink (sprintf "/proc/%d/exe" (Pid.to_int (Unix.getpid ()))) 166 | >>= fun our_bin -> 167 | let our_dir = Filename.dirname our_bin in 168 | Unix.system_exn (sprintf "%s/test_remote.exe" our_dir) 169 | 170 | let init () = 171 | (* Before calling [Parallel.create], we force creation of the async scheduler, so 172 | we can test its ability to reset in the forked master process. *) 173 | ignore (Lazy.force Writer.stdout); 174 | assert (not (Scheduler.is_ready_to_initialize ())); 175 | Parallel.init ~fail_if_async_has_been_initialized:false (); 176 | Deferred.unit 177 | ;; 178 | 179 | let shutdown () = Parallel.shutdown () >>| Or_error.ok_exn 180 | 181 | let tests = 182 | [ "Parallel_test", (fun () -> 183 | init () >>= fun () -> 184 | Deferred.all_unit [ 185 | wide (); 186 | deep (); 187 | fib (); 188 | ring (); 189 | echo_server (); 190 | (* remote (); *) 191 | ] >>= fun () -> 192 | shutdown ()) 193 | ] 194 | ;; 195 | -------------------------------------------------------------------------------- /test/qtest.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | let tests = 4 | [] 5 | @ Parallel_test.tests 6 | ;; 7 | 8 | let () = Qtest_lib.Std.Runner.main tests 9 | -------------------------------------------------------------------------------- /test/test_remote.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Async 3 | open Async_parallel_deprecated.Std 4 | 5 | let wide () = 6 | Deferred.List.iter [ 1; 2; 10; 100 ] ~f:(fun num_children -> 7 | Deferred.all 8 | (List.init num_children ~f:(fun i -> 9 | Parallel.run ~where:Parallel.random (fun () -> return i) 10 | >>| function 11 | | Error e -> failwith e 12 | | Ok i -> i)) 13 | >>| fun l -> 14 | assert (l = List.init num_children ~f:Fn.id)) 15 | ;; 16 | 17 | let fib () = 18 | let rec fib n = 19 | Parallel.run ~where:Parallel.round_robin (fun () -> 20 | match n with 21 | | 0 -> return 0 22 | | 1 -> return 1 23 | | n -> 24 | fib (n - 1) 25 | >>= fun f1 -> 26 | fib (n - 2) 27 | >>= fun f2 -> 28 | return (f1 + f2)) 29 | >>| function 30 | | Error e -> failwith e 31 | | Ok n -> n 32 | in 33 | fib 10 34 | >>| fun n -> 35 | assert (n = 55) 36 | ;; 37 | 38 | let main () = 39 | Parallel.init ~cluster: 40 | {Cluster.master_machine = Unix.gethostname (); 41 | worker_machines = ["localhost"]} (); 42 | Deferred.all_unit [ wide (); fib () ] >>> fun () -> 43 | Parallel.shutdown () >>> fun _ -> Shutdown.shutdown 0 44 | ;; 45 | 46 | let () = 47 | main (); 48 | never_returns (Scheduler.go ()) 49 | --------------------------------------------------------------------------------