├── .gitignore ├── Makefile ├── README.md ├── include └── nkcluster.hrl ├── rebar ├── rebar.config ├── src ├── nkcluster.app.src ├── nkcluster.erl ├── nkcluster_agent.erl ├── nkcluster_app.erl ├── nkcluster_job_class.erl ├── nkcluster_jobs.erl ├── nkcluster_jobs_core.erl ├── nkcluster_node_proxy.erl ├── nkcluster_nodes.erl ├── nkcluster_nodes.erl.1 ├── nkcluster_protocol.erl ├── nkcluster_sup.erl └── nkcluster_syntax.erl ├── test ├── app.config ├── basic_test.erl ├── cluster_test.erl ├── task_test.erl ├── test_job_class.erl └── vm.args └── util ├── dev1.config ├── dev2.config ├── dev3.config ├── dev4.config ├── dev5.config ├── dev_vm.args ├── riak_core.schema ├── shell_app.config └── shell_vm.args /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | deps 3 | dev 4 | ebin 5 | log 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REPO ?= nkcluster 2 | #RELOADER ?= -s nkreloader 3 | 4 | .PHONY: deps release dev 5 | 6 | all: deps compile 7 | 8 | compile: 9 | ./rebar compile 10 | 11 | cnodeps: 12 | ./rebar compile skip_deps=true 13 | 14 | deps: 15 | ./rebar get-deps 16 | find deps -name "rebar.config" | xargs perl -pi -e 's/lager, "2.0.3"/lager, ".*"/g' 17 | (cd deps/lager && git checkout 2.1.1) 18 | 19 | clean: 20 | ./rebar clean 21 | 22 | distclean: clean 23 | ./rebar delete-deps 24 | 25 | tests: compile eunit 26 | 27 | eunit: 28 | export ERL_FLAGS="-config test/app.config -args_file test/vm.args"; \ 29 | ./rebar eunit skip_deps=true 30 | 31 | shell: 32 | erl -config util/shell_app.config -args_file util/shell_vm.args -s nkcluster_app $(RELOADER) 33 | 34 | shell-test: 35 | erl -config test/app.config -args_file test/vm.args -s nkcluster_app $(RELOADER) 36 | 37 | docs: 38 | ./rebar skip_deps=true doc 39 | 40 | 41 | dev1: 42 | erl -config util/dev1.config -args_file util/dev_vm.args \ 43 
| -name dev1@127.0.0.1 -s nkcluster_app $(RELOADER) 44 | 45 | dev2: 46 | erl -config util/dev2.config -args_file util/dev_vm.args \ 47 | -name dev2@127.0.0.1 -s nkcluster_app $(RELOADER) 48 | 49 | dev3: 50 | erl -config util/dev3.config -args_file util/dev_vm.args \ 51 | -name dev3@127.0.0.1 -s nkcluster_app $(RELOADER) 52 | 53 | dev4: 54 | erl -config util/dev4.config -args_file util/dev_vm.args \ 55 | -name dev4@127.0.0.1 -s nkcluster_app $(RELOADER) 56 | 57 | dev5: 58 | erl -config util/dev5.config -args_file util/dev_vm.args \ 59 | -name dev5@127.0.0.1 -s nkcluster_app $(RELOADER) 60 | 61 | dev: 62 | erl -config test/app.config -args_file test/vm.args \ 63 | -s nkcluster_app $(RELOADER) 64 | 65 | 66 | 67 | APPS = kernel stdlib sasl erts ssl tools os_mon runtime_tools crypto inets \ 68 | xmerl webtool snmp public_key mnesia eunit syntax_tools compiler 69 | COMBO_PLT = $(HOME)/.$(REPO)_combo_dialyzer_plt 70 | 71 | check_plt: 72 | dialyzer --check_plt --plt $(COMBO_PLT) --apps $(APPS) deps/*/ebin 73 | 74 | build_plt: 75 | dialyzer --build_plt --output_plt $(COMBO_PLT) --apps $(APPS) deps/nk*/ebin 76 | 77 | dialyzer: 78 | dialyzer -Wno_return --plt $(COMBO_PLT) ebin/nkcluster*.beam #| \ 79 | # fgrep -v -f ./dialyzer.ignore-warnings 80 | 81 | cleanplt: 82 | @echo 83 | @echo "Are you sure? It takes about 1/2 hour to re-build." 84 | @echo Deleting $(COMBO_PLT) in 5 seconds. 
85 | @echo 86 | sleep 5 87 | rm $(COMBO_PLT) 88 | 89 | 90 | build_tests: 91 | erlc -pa ebin -pa deps/lager/ebin -o ebin -I include -pa deps/nklib \ 92 | +export_all +debug_info +"{parse_transform, lager_transform}" \ 93 | test/*.erl 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | * [Introduction](#introduction) 2 | * [Quick Start](#quick-start) 3 | * [Use cases and deployment scenarios](#use-cases-and-deployment-scenarios) 4 | * [Startup and Discovery](#startup-and-discovery) 5 | * [Operation](#operation) 6 | * [Requests, Tasks and Job Classes](#requests-tasks-and-job-classes) 7 | * [Failure scenarios](#failure-scenarios) 8 | * [Configuration](#configuration) 9 | 10 | 11 | # Introduction 12 | 13 | NkCLUSTER is a framework for creating clusters of Erlang nodes of any size, and for distributing and managing jobs across them. It uses its own cluster management solution, based on [_NkDIST_](https://github.com/nekso/nkdist), [_riak_core_](https://github.com/basho/riak_core) and a [_custom distribution protocol_](src/nkcluster_protocol.erl). NkCLUSTER is one of the core pieces of the upcoming [_NetComposer_](http://www.slideshare.net/carlosjgf/net-composer-v2) platform, but can be used on its own. 14 | 15 | Standard Erlang clusters are very convenient and easy to use, but they have some important limitations: 16 | * Since each Erlang node must open a connection to every other node in the cluster, it is usually not practical to scale beyond about 50-100 nodes (_hidden nodes_ are a possible workaround). 17 | * Transport options are limited: only TCP is (easily) available, and it is not very firewall-friendly. 18 | * Sending large messages can affect the latency of other, small messages. 19 | * Adding and removing nodes does not redistribute the load among the nodes. 20 | * In practical terms, all nodes must usually belong to the same LAN.
21 | 22 | NkCLUSTER allows the creation of clusters with a very large number of nodes. It uses a _hybrid_ approach: any node in the cluster can have two different roles: 23 | 24 | * _Control_ role: These nodes receive, share and process requests from the network, and manage and send jobs to nodes with the _worker_ role. 25 | * _Worker_ role: These nodes receive job requests from any node with the _control_ role and execute them, and can also receive specialized network traffic to process. They can also have special jobs, like being a network router or a disk server. 26 | 27 | All of the nodes of the cluster run the same Erlang application (_nkcluster_), but the first 'N' nodes in the cluster are _primary_ nodes (they have both the _control_ and _worker_ roles), and the rest of the nodes are _secondary_ nodes (they only have the _worker_ role). N must be a power of two, typically 16, 32, 64 or 128. Primary nodes create a _riak_core_ cluster among themselves. The set of all primary nodes is called the primary cluster, a subset of the whole cluster. 28 | 29 | Primary nodes talk among themselves using the standard Erlang distribution protocol. From the NkCLUSTER point of view, worker nodes talk only with their _node proxy_, placed at some control node, using TCP, TLS, SCTP, WS or WSS transports (they can of course talk with other worker nodes or anything else they want to using other means). NkCLUSTER is designed to allow worker nodes to be deployed at WAN distances from control nodes (for example, at different cloud providers). They can be very _firewall-friendly_, for example using _websocket_ transports on port 80 or 443 and, in some circumstances, without any open incoming port. However, all control nodes should still belong to the same LAN. 30 | 31 | NkCLUSTER uses the concepts of _requests_, _tasks_, _events_ and _job classes_. For each worker node, a _node proxy_ process is started at some specific control node, and can be used to send requests and tasks to its managed worker node.
Requests are short-lived RPC calls. Tasks are long-lived Erlang processes running at a worker node and managed from a control process at a control node. Events are pieces of information sent from a worker node to its node proxy. All of them must belong to a previously defined [_job_class_](src/nkcluster_job_class.erl). 32 | 33 | Any node in the cluster can have a set of _labels_ or _metadata_ associated with it, with an _url-like format_. For example, `core;group=group1;master, meta2;labelA=1;labelB=2` would add two metadata items, `core` and `meta2`, each with some keys (`master`, `labelA`...) and, optionally, a value for each key (`group1`, `2`...). This metadata can be used for any purpose, for example to group nodes or to decide where to place a specific task in the cluster. 34 | 35 | NkCLUSTER includes some out-of-the-box tools like metadata management, hot remote loading (or updating) of Erlang code, launching OS commands, [Docker](https://www.docker.com) management, etc. NkCLUSTER allows _on the fly_ addition and removal of nodes (both control and worker) and is designed so that jobs can be developed without disrupting service at any moment. 36 | 37 | NkCLUSTER scales seamlessly, from a single machine to a cluster of 10-20 _control+worker_ nodes, all the way to a _huge_ cluster where, for example, 50-100 control nodes manage thousands or tens of thousands of worker nodes. In a future version, NkCLUSTER will allow dynamic creation of nodes in public clouds like Amazon or Azure, or private ones like OpenStack. 38 | 39 | NkCLUSTER requires Erlang R17+. 40 | 41 | 42 | # Quick Start 43 | 44 | ``` 45 | git clone https://github.com/Nekso/nkcluster.git 46 | make 47 | ``` 48 | 49 | Then you can open five different consoles and start one node in each: `make dev1`, `make dev2`, `make dev3`, `make dev4` and `make dev5`. 50 | 51 | Nodes 1, 2 and 3 are control/worker nodes. Nodes 4 and 5 are worker nodes.
The cluster should discover everything automatically, but you must deploy the cluster plan. At any of nodes 1, 2 or 3: 52 | 53 | ```erlang 54 | > nkcluster_nodes:get_nodes(). 55 | [<<"dev1">>,<<"dev2">>,<<"dev3">>,<<"dev4">>,<<"dev5">>] 56 | 57 | > nkcluster_nodes:get_node_info(<<"dev4">>). 58 | {ok, #{id=><<"dev4">>, ...}} 59 | 60 | > nkcluster:call(<<"dev3">>, erlang, node, [], 5000). 61 | {reply, 'dev3@127.0.0.1'} 62 | ``` 63 | 64 | 65 | # Use cases and deployment scenarios 66 | 67 | NkCLUSTER is designed to solve a specific type of distributed work problem. It is probably useful for other scenarios, but it is especially well suited for the following case, where any _job class_ is composed of up to three _layers_: 68 | 69 | * A mandatory, lightweight _control_, _manager_, or _smart_ layer, written in Erlang (or any other BEAM language, like Elixir or LFE) that runs at all control nodes, so that any of them can receive a request to start or manage any task associated with this class. Optionally, a lightweight helper OS process or Docker container could be started at its co-located worker node (written in any programming language). 70 | * Optionally, a possibly heavyweight set of OS processes, Docker containers or _pods_, running at specific worker nodes, and managed locally by an Erlang application sent from the control cluster _over the wire_, that talks with the controllers at the control nodes. 71 | * Lastly, an optional, lightweight _state_ associated with each task, also written in Erlang, and running at the same control node as the _node proxy_ process for the worker node where the real job runs. Since node proxies are spread evenly among the cluster, your state processes will also be spread automatically. 72 | 73 | NkCLUSTER includes full support for managing OS processes (using [NkLIB](https://github.com/Nekso/nklib)) and for managing Docker containers (using [NkDOCKER](https://github.com/Nekso/nkdocker)).
The recommended NkCLUSTER approach is to have, at the worker nodes, a local Erlang application monitoring processes and containers, and sending high-level, aggregated information to the control cluster, instead of sending raw stdout or similar low-level information. This way, even if the connection is temporarily lost, the worker node can continue working to some extent while the connection to the control nodes is re-established. 74 | 75 | This architecture is well suited for many real-world scenarios, for example: 76 | 77 | * SIP/WebRTC media processing framework, where: 78 | * [NkSIP](https://github.com/kalta/nksip) and WebRTC controllers run at every control node, accepting connections and managing SIP and WebRTC signaling. 79 | * each specific call or conference is an Erlang process that is evenly distributed across the control cluster. 80 | * a number of [Freeswitch](https://freeswitch.org/) instances are started at worker nodes as Docker containers for the heavy media management (transcoding, recording, etc.) 81 | * Software-defined storage network based on [Ceph](http://ceph.com), where: 82 | * the Ceph control daemons run at all of the control nodes. 83 | * the disk control daemons run at each worker node offering disks to the network (the middle layer would not be used in this case). 84 | * Highly available [Kubernetes](http://kubernetes.io) cluster, where: 85 | * _etcd_ and _controlling_ processes run at the control nodes. 86 | * the _real work_ Docker containers and controllers run at each specific worker node. 87 | * Parallel or _streaming_ processing, where: 88 | * the control nodes receive streaming information to process, and decide which worker node must do the real processing. For each task, an equivalent Erlang process is started at the control cluster. 89 | * the selected worker node receives the piece of information and processes it, sending the response back.
90 | * IoT platform, where: 91 | * the remote devices discover their associated worker node by talking to the control cluster. 92 | * each device connects directly to its assigned worker node. Each worker node can handle a large number of devices. In case of failure, devices find a new node and try to reconnect to it. 93 | 94 | 95 | Depending on the configuration, several possible scenarios are supported: 96 | 97 | * _Standard_. Each node listens on a defined port, using any supported transport. Then, you configure all nodes with the addresses of the control cluster (by hand or using NAPTR, SRV or multiple-DNS resolution): 98 | * The first node is started as a _control+worker_ node. 99 | * Zero or more nodes are started as _control+worker_ nodes. They automatically discover other control nodes, connect to them and form a _riak_core_ cluster. 100 | * Zero or more nodes are started as _workers_ only, and, using configuration or DNS, they discover and connect to a random control node. The receiving control node decides the _final_ location for the control process (using riak_core distribution) and the corresponding _node proxy_ process starts there. 101 | * Each node proxy process starts a connection to its controlled worker node. In case of failure, or if a specific request asks for an exclusive connection (for example for sending big files), a new connection is started. 102 | * _No-discovery_. If you don't supply a `cluster_addr` parameter, no discovery is performed. Control nodes must be joined manually, and connections to worker nodes must be explicitly started. 103 | * _No-listening_. Worker nodes can work without a listening address (for example, behind a firewall not accepting any incoming connection). In this case, the discovery connections started by the worker nodes are reused. However, in case of connection failure, control nodes must wait for the next discovery before reconnecting to the nodes.
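These deployment scenarios are selected through the node's standard application environment (detailed in the Configuration section below). As an illustrative sketch only (the cluster name, host name, password and metadata are hypothetical values, not taken from this repository), a worker-only node discovering the control cluster could use a `sys.config` like:

```erlang
%% Hypothetical sys.config for a worker-only node.
%% Omitting cluster_addr gives the "no-discovery" scenario; omitting
%% listen gives the "no-listening" scenario, where the discovery
%% connections started by the worker are reused.
[
    {nkcluster, [
        {cluster_name, "my_cluster"},
        {cluster_addr, "nkcluster://cluster.example.com;transport=wss"},
        {password, "my_password"},
        {is_control, false},
        {meta, "group;dc=dc1"}
    ]}
].
```

The node would then be started with something like `erl -config sys.config -s nkcluster_app`, as the Makefile targets above do for the `dev` nodes.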
104 | 105 | 106 | # Startup and Discovery 107 | 108 | The same Erlang application (`nkcluster`) is used at control nodes (_control+worker_ nodes actually) and at worker-only nodes. When configuring a new node you must supply: 109 | 110 | * A cluster name. 111 | * A set of listening network points, using the _[NkPACKET](https://github.com/nekso/nkpacket) url-like format_, for example `nkcluster://localhost;transport=tcp, nkcluster://localhost;transport=wss` 112 | * A password. 113 | * Whether the node is going to be a _control_ node (besides being a _worker_ node) or not. 114 | * Metadata associated with the node. 115 | * Optionally, one or several network addresses to discover control nodes. 116 | 117 | If a set of _cluster discovery addresses_ is configured, the node will extract all transports, IP addresses and ports from them, randomize the list and try to connect to each one until one accepts the connection. You should include all planned control nodes, DNS addresses returning several records, etc., for example: 118 | 119 | ```erlang 120 | {cluster_addr, "nkcluster://my_domain, nkcluster://10.0.0.1:1234;transport=wss"} 121 | ``` 122 | 123 | In this example, the node will extract all IP addresses and ports from `my_domain` (using DNS), and will add `10.0.0.1:1234` using the `wss` transport to the list. It will then randomize the list and start trying to connect to each one in turn. 124 | 125 | The password for this specific node and TLS options can be included in the url: 126 | ```erlang 127 | {cluster_addr, "nkcluster://10.0.0.1:1234;transport=wss;password=pass1;tls_depth=2"} 128 | ``` 129 | 130 | NkPACKET supports the following DNS records for discovery: 131 | * [_NAPTR_](https://en.wikipedia.org/wiki/NAPTR_record) records. If you don't supply a port or transport, it will try to find a NAPTR record for the domain. For example, with this record: 132 | 133 | ``` 134 | example.com NAPTR 10 100 "S" "NKS+D2W" "" _nks._ws.example.com.
135 | example.com NAPTR 10 200 "S" "NKS+D2T" "" _nks._tcp.example.com. 136 | example.com NAPTR 10 300 "S" "NK+D2W" "" _nk._ws.example.com. 137 | example.com NAPTR 10 400 "S" "NK+D2T" "" _nk._tcp.example.com. 138 | example.com NAPTR 10 500 "S" "NK+D2S" "" _nk._sctp.example.com. 139 | ``` 140 | NkCLUSTER will first try the _WSS_ transport (resolving `_nks._ws.example.com` as a _SRV_ domain to find IPs and ports), then _TLS_, then _WS_, then _TCP_ and finally _SCTP_ as a last resort. 141 | 142 | * [_SRV_](https://en.wikipedia.org/wiki/SRV_record) records. After a _NAPTR_ response, or if you supplied the desired transport, the corresponding _SRV_ record will be resolved. For example, if `tcp` was selected, with this record: 143 | 144 | ``` 145 | _nk._tcp.example.com. 86400 IN SRV 0 5 1972 cluster.example.com. 146 | ``` 147 | NkCLUSTER will try to resolve `cluster.example.com`, taking all IPs from there and using port `1972`. 148 | 149 | * [_Round-robin DNS_](https://en.wikipedia.org/wiki/Round-robin_DNS). Each time NkCLUSTER must resolve an IP address, if the query returns multiple `A` records, it will randomize the list. 150 | 151 | The receiving control node will accept the connection, and both parties will authenticate each other, sending a challenge (the node _UUID_, a random string auto-generated at boot) that must be _signed_ using the local password with [PBKDF2](https://en.wikipedia.org/wiki/PBKDF2). If everything is OK, the control node will select the _right_ node to host the node proxy process and start it there. If the selected node is a different one, the node proxy will try to start a direct connection to the worker, if possible. 152 | 153 | Using this node proxy process, you can send requests and start tasks at the worker node. Worker nodes will also send periodic information about their state (status, cpu, memory, etc.). If the connection fails, the control process will try to set it up again.
Worker nodes will also try to connect again by themselves if no control node contacts them in a while. In some cases (like a network split) a single worker node could be _connected_ to several proxy processes at different nodes, but this should be a temporary situation, resolved once the cluster converges again. 154 | 155 | 156 | # Operation 157 | 158 | Based on _riak_core_, NkCLUSTER allows the addition and removal of nodes at any moment. 159 | 160 | When a new control node is added, it automatically discovers and joins the riak_core cluster. While the cluster reorganizes, node proxy processes are relocated so that they are evenly distributed among the cluster again. When a node is removed, the opposite happens, also automatically. 161 | 162 | Worker nodes can also be added and removed at any moment, and the changes are recognized automatically by the control nodes. Worker nodes can be in the following states: 163 | 164 | * _Launching_: The node is currently starting. 165 | * _Connecting_: The cluster is currently trying to connect to the node. 166 | * _Ready_: The node is ready to receive requests and tasks. 167 | * _Standby_: The node is ready, but it is not currently accepting any new task. 168 | * _Stopping_: The node is scheduled to stop as soon as no remaining tasks are running. It does not accept any new task. All tasks are notified to stop as soon as possible. Once all tasks have stopped, the status changes to _Stopped_ automatically. 169 | * _Stopped_: The node is alive but stopped, not accepting any new task. 170 | * _Not Connected_: The cluster is not currently able to connect to the node. 171 | 172 | Requests are allowed in the _ready_, _standby_, _stopping_ and _stopped_ states. 173 | 174 | 175 | # Requests, Tasks and Job Classes 176 | 177 | Before sending any request or task to a worker node, you must define a _job_class_ module, implementing the [`nkcluster_job_class`](src/nkcluster_job_class.erl) behaviour.
This callback module must be available at both sides (control and worker). You must implement the following callbacks: 178 | 179 | Name|Side|Description 180 | ---|---|--- 181 | request/2|Worker|Called at the worker node when a new request must be processed. Must send an immediate reply. 182 | task/2|Worker|Called at the worker node when a new task is asked to start. Must start a new process and return its `pid()` and, optionally, an immediate response. The task can send _events_ at any moment. 183 | command/3|Worker|Called at the worker when a new command is sent to an existing task. Must return an immediate reply. 184 | status/2|Worker|Called at the worker when the node status changes. If the status changes to `stopping`, the task should stop as soon as possible. 185 | event/2|Control|Called at the control node when a task at the worker side sends an event back. 186 | 187 | You can send your own Erlang module (or modules) to the remote side, over the wire, using `nkcluster:load_module/2` or `nkcluster:load_modules/2`. 188 | 189 | Once your _job_class_ module is defined, you can send requests, start tasks and send messages to tasks. Tasks can send events back to the job class. 190 | 191 | You can send requests calling `nkcluster:request/3,4`. The callback `request/2` will be called at the remote (worker) side, and your call will block until a response is sent back. You can define a _timeout_, and also ask NkCLUSTER to start a new, exclusive connection to the worker if possible. If the connection or the node fails, an error will be returned. If you are not asking for a new, exclusive connection, your request processing time must be very short (< 100 msecs). Otherwise, the periodic ping will be delayed and the connection may be dropped. 192 | 193 | For long-running jobs, you must start a new task, calling `nkcluster:task/3,4`. The callback `task/2` will be called at the remote side, and it must start a new Erlang process and return its `pid()` and, optionally, a reply.
A _task_id_ is returned to the caller, along with the reply if one was sent. You can send messages to any started task calling `nkcluster:command/4,5`. 194 | 195 | The started task can send _events_ back to the control node at any moment, calling `nkcluster_jobs:send_event/2`. The event will arrive at the control node, and the callback `event/2` will be called for the corresponding job class. NkCLUSTER will also send some automatic events: 196 | 197 | Event|Description 198 | ---|--- 199 | {nkcluster, {task_started, TaskId}}|The task has successfully started. 200 | {nkcluster, {task_stopped, TaskId, Reason}}|The task has stopped (meaning the Erlang process has stopped). 201 | {nkcluster, {node_status, nkcluster:node_status()}}|The node status has changed. If it changes to `not_connected`, some events may have been lost. 202 | 203 | 204 | ## Built-in requests 205 | 206 | NkCLUSTER offers some standard utility requests out of the box, available in the [`nkcluster`](src/nkcluster.erl) module: 207 | 208 | Name|Desc 209 | ---|--- 210 | get_info/1|Gets current information about the node, including its status 211 | set_status/2|Changes the status of the node 212 | get_tasks/1|Gets all tasks running at a worker node 213 | get_tasks/2|Gets all tasks running at a worker node and belonging to a job class 214 | get_meta/1|Gets the current metadata of a remote node 215 | update_meta/2|Updates the metadata at a remote node 216 | get_data/2|Gets a piece of data stored at the remote registry 217 | put_data/3|Stores or updates a piece of data at the remote registry 218 | call/5|Calls an Erlang function at a remote node 219 | spawn_call/5|Calls an Erlang function at a remote node, not blocking the channel 220 | send_file/3|Sends a file to a remote node. For files > 4KB, it switches to send_bigfile/3.
221 | send_bigfile/3|Sends a file to a remote node, starting a new connection (if possible), in 1MByte chunks 222 | load_module/2|Sends a currently loaded Erlang module and loads it at the remote node 223 | load_modules/2|Sends a set of currently loaded Erlang modules and loads them at the remote node 224 | 225 | 226 | # Failure scenarios 227 | 228 | ## Connection failure 229 | 230 | When the main connection of a node proxy to its managed worker node fails, the following things happen: 231 | 232 | * An `{error, connection_failed}` error is returned to all pending requests sent over this connection and still waiting for a response. Since the connection has failed, the remote connection (and the request itself, since it is usually running in the same process) has also probably failed (you cannot, however, assume this for sure). 233 | * An `{nkcluster, {node_status, not_connected}}` event is sent to all classes that the control node has _seen_ (all classes that have received at least one event). Job classes must then assume that some events may have been lost, and must try to recover their state once the connection is alive again. 234 | * The control node will try to connect immediately to the remote node (only if it published one or several listening points). If it fails, it will try again periodically. Meanwhile, all new requests will fail. 235 | * If the remote worker didn't publish any listening address, the control process exits. The remote node should try to discover nodes again periodically, and a new control process will then be started. 236 | 237 | Requests can start a secondary, exclusive connection. In this case, the failure of the main connection does not affect them (but it does affect their events). If this exclusive connection is the one that fails, the request will fail in the same manner described above.
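Recovering from a connection failure therefore happens in the job class's `event/2` callback at the control side. A minimal sketch follows; the module name `my_job_class`, the `resync/1` helper and the exact argument order of `event/2` are assumptions for illustration, not taken from the source:

```erlang
%% Control-side sketch of a nkcluster_job_class callback module.
%% 'my_job_class' and resync/1 are hypothetical; event/2 is assumed to
%% receive the worker's node id and the event term. The worker-side
%% callbacks (request/2, task/2, command/3, status/2) are omitted here.
-module(my_job_class).
-behaviour(nkcluster_job_class).

-export([event/2]).

event(_NodeId, {nkcluster, {task_started, _TaskId}}) ->
    %% The remote task process started successfully
    ok;
event(_NodeId, {nkcluster, {task_stopped, _TaskId, _Reason}}) ->
    %% The remote Erlang process has stopped
    ok;
event(NodeId, {nkcluster, {node_status, not_connected}}) ->
    %% Some events may have been lost: rebuild our view of the
    %% worker's tasks once the connection is alive again
    resync(NodeId);
event(_NodeId, _Event) ->
    ok.

resync(_NodeId) ->
    %% Hypothetical placeholder: e.g. retry nkcluster:get_tasks/2 until
    %% the node is reachable again, then reconcile local task state.
    ok.
```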
238 | 239 | 240 | ## Node failure 241 | 242 | If the node proxy process fails (because of the failure of the whole control node or a bug), all pending requests will also fail (since they are using `gen_server:call/3` under the hood). The worker node will detect this and try to announce itself again. The control cluster will then start a new node proxy process at the same node (if it is alive again) or at another one. 243 | 244 | If the worker node process fails (because of the failure of the whole worker node or a bug), the control process will enter the _not_connected_ state, notifying all detected job classes as described before. If the worker node no longer exists, you must call `nkcluster_nodes:stop(NodeId)` to remove it. 245 | 246 | 247 | # Configuration 248 | 249 | NkCLUSTER uses standard Erlang application environment variables. The same Erlang application is used for agents and controllers. 250 | 251 | Option|Type|Default|Desc 252 | ---|---|---|--- 253 | cluster_name|`term()`|`"nkcluster"`|Nodes will only connect to other nodes in the same cluster 254 | cluster_addr|`nklib:user_uri()`|`""`|List of control nodes to connect to (see above) 255 | password|`string()|binary()`|`"nkcluster"`|Password to use when connecting to or from control nodes 256 | meta|`string()|binary()`|`""`|Metadata for this node (i.e. 
`"class;data=1, location;dc=here"`) 257 | is_control|`boolean()`|`true`|If this node has the `control` role 258 | listen|`nklib:user_uri()`|`"nkcluster://all;transport=tls`|List of addresses, ports and transports to listen on (see above) 259 | tls_certfile|`string()`|-|Custom certificate file 260 | tls_keyfile|`string()`|-|Custom key file 261 | tls_cacertfile|`string()`|-|Custom CA certificate file 262 | tls_password|`string()`|-|Password fort the certificate 263 | tls_verify|`boolean()`|false|If we must check certificate 264 | tls_depth|`integer()`|0|TLS check depth 265 | -------------------------------------------------------------------------------- /include/nkcluster.hrl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -ifndef(NKCLUSTER_HRL_). 22 | -define(NKCLUSTER_HRL_, 1). 
23 | 24 | %% =================================================================== 25 | %% Defines 26 | %% =================================================================== 27 | 28 | 29 | 30 | %% =================================================================== 31 | %% Records 32 | %% =================================================================== 33 | 34 | 35 | 36 | 37 | -endif. 38 | 39 | -------------------------------------------------------------------------------- /rebar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NetComposer/nkcluster/567174fe172ffd057a27baf5e839bc872d3bf023/rebar -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | % {lib_dirs, ["deps"]}. 2 | 3 | {erl_opts, [ 4 | % native, 5 | debug_info, 6 | fail_on_warning, 7 | {parse_transform, lager_transform} 8 | ]}. 9 | 10 | {cover_enabled, true}. 11 | {cover_export_enabled, true}. 12 | 13 | 14 | {deps, [ 15 | {nkdist, ".*", {git, "https://github.com/Nekso/nkdist.git", {branch, "master"}}}, 16 | {nkpacket, ".*", {git, "https://github.com/Nekso/nkpacket.git", {branch, "master"}}}, 17 | {nkdocker, ".*", {git, "https://github.com/Nekso/nkdocker.git", {branch, "master"}}}, 18 | {eper, ".*", {git, "http://github.com/massemanet/eper.git", {branch, "master"}}} 19 | ]}. 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/nkcluster.app.src: -------------------------------------------------------------------------------- 1 | {application, nkcluster, [ 2 | {description, "Nekso Cluster Framework"}, 3 | {vsn, "master"}, 4 | {modules, []}, 5 | {registered, []}, 6 | {mod, {nkcluster_app, []}}, 7 | {applications, [ 8 | kernel, 9 | stdlib, 10 | crypto, 11 | sasl, 12 | ssl, 13 | lager, 14 | nklib, 15 | nkpacket, 16 | os_mon 17 | ]}, 18 | {env, []} 19 | ]}. 
20 | -------------------------------------------------------------------------------- /src/nkcluster.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER User Functions 22 | -module(nkcluster). 23 | -author('Carlos Gonzalez '). 24 | 25 | -export([get_info/1, set_status/2]). 26 | -export([get_tasks/1, get_tasks/2, get_meta/1, update_meta/2, remove_meta/2]). 27 | -export([get_data/2, put_data/3, del_data/2, call/5, spawn_call/5]). 28 | -export([send_file/3, send_bigfile/3, load_modules/2, load_module/2]). 29 | -export([request/3, request/4, task/3, task/4, command/4, command/5]). 30 | 31 | -export_type([node_id/0, conn_id/0, job_class/0, task_id/0]). 32 | -export_type([request/0, task/0, command/0, reply/0, event/0]). 33 | -export_type([node_status/0]). 34 | 35 | -define(CLASS, nkcluster_jobs_core). 36 | 37 | 38 | %% =================================================================== 39 | %% Types 40 | %% =================================================================== 41 | 42 | 43 | -type node_id() :: binary(). 
44 | -type conn_id() :: nkcluster_protocol:conn_id(). 45 | -type job_class() :: module(). 46 | -type request() :: term(). 47 | -type task_id() :: term(). 48 | -type task() :: term(). 49 | -type command() :: term(). 50 | -type reply() :: term(). 51 | -type event() :: term(). 52 | -type node_status() :: launching | connecting | ready | standby | stopping | stopped | 53 | not_connected. 54 | 55 | -type conn_spec() :: node_id() | pid(). 56 | -type req_opts() :: nkcluster_node_proxy:rpc_opts(). 57 | 58 | 59 | 60 | %% =================================================================== 61 | %% Public 62 | %% =================================================================== 63 | 64 | 65 | %% @doc Gets node info (from the master controller) 66 | -spec get_info(node_id()) -> 67 | {ok, nkcluster_nodes:info()} | {error, term()}. 68 | 69 | get_info(Node) -> 70 | nkcluster_nodes:get_node_info(Node). 71 | 72 | 73 | %% @doc Updates the remote status 74 | -spec set_status(conn_spec(), ready|standby|stopped) -> 75 | ok | {error, term()}. 76 | 77 | set_status(Node, Status) when Status==ready; Status==standby; Status==stopped -> 78 | request2(Node, ?CLASS, {set_status, Status}). 79 | 80 | 81 | %% @doc Gets all started tasks 82 | -spec get_tasks(conn_spec()) -> 83 | {ok, [{job_class(), task_id(), pid()}]} | 84 | {error, term()}. 85 | 86 | get_tasks(Node) -> 87 | request2(Node, ?CLASS, get_tasks). 88 | 89 | 90 | %% @doc Gets all started tasks for a job class 91 | -spec get_tasks(conn_spec(), job_class()) -> 92 | {ok, [{task_id(), pid()}]} | 93 | {error, term()}. 94 | 95 | get_tasks(Node, Class) -> 96 | request2(Node, ?CLASS, {get_tasks, Class}). 97 | 98 | 99 | %% @doc Gets all registered metadata 100 | -spec get_meta(conn_spec()) -> 101 | {ok, [nklib:token()]} | {error, term()}. 102 | 103 | get_meta(Node) -> 104 | request2(Node, ?CLASS, get_meta). 105 | 106 | 107 | %% @doc Registers new metadata 108 | -spec update_meta(conn_spec(), nklib:token()) -> 109 | {ok, [nklib:token()]} | {error, term()}.
110 | 111 | update_meta(Node, Tokens) -> 112 | case nklib_parse:tokens(Tokens) of 113 | error -> 114 | {error, invalid_tokens}; 115 | Parsed -> 116 | request2(Node, ?CLASS, {update_meta, Parsed}) 117 | end. 118 | 119 | 120 | %% @doc Removes some metadata 121 | -spec remove_meta(conn_spec(), nklib:token()) -> 122 | {ok, [nklib:token()]} | {error, term()}. 123 | 124 | remove_meta(Node, Tokens) -> 125 | case nklib_parse:tokens(Tokens) of 126 | error -> 127 | {error, invalid_tokens}; 128 | Parsed -> 129 | Keys = [Key || {Key, _} <- Parsed], 130 | request2(Node, ?CLASS, {remove_meta, Keys}) 131 | end. 132 | 133 | 134 | 135 | %% @doc Gets remotely stored data 136 | -spec get_data(conn_spec(), term()) -> 137 | {ok, term()} | {error, term()}. 138 | 139 | get_data(Node, Key) -> 140 | request2(Node, ?CLASS, {get_data, Key}). 141 | 142 | 143 | %% @doc Stores data remotely 144 | -spec put_data(conn_spec(), term(), term()) -> 145 | ok | {error, term()}. 146 | 147 | put_data(Node, Key, Val) -> 148 | request2(Node, ?CLASS, {put_data, Key, Val}). 149 | 150 | 151 | %% @doc Removes data remotely 152 | -spec del_data(conn_spec(), term()) -> 153 | ok | {error, term()}. 154 | 155 | del_data(Node, Key) -> 156 | request2(Node, ?CLASS, {del_data, Key}). 157 | 158 | 159 | %% @doc Calls a remote Erlang function 160 | -spec call(conn_spec(), atom(), atom(), list(), integer()) -> 161 | {reply, term()} | {error, term()}. 162 | 163 | call(Node, Mod, Fun, Args, Timeout) -> 164 | request(Node, ?CLASS, {call, Mod, Fun, Args}, #{timeout=>Timeout}). 165 | 166 | 167 | %% @doc Calls a remote Erlang function, in a spawned process 168 | -spec spawn_call(conn_spec(), atom(), atom(), list(), integer()) -> 169 | {reply, term()} | {error, term()}. 170 | 171 | spawn_call(Node, Mod, Fun, Args, Timeout) -> 172 | request(Node, ?CLASS, {spawn_call, Mod, Fun, Args}, #{timeout=>Timeout}). 173 | 174 | 175 | %% @doc Sends a file. 176 | %% If the size is below 4KB, it will be sent synchronously.
If it is bigger, 177 | %% send_bigfile/3 will be used. 178 | %% If the remote directory does not exist, it will be created. 179 | -spec send_file(conn_spec(), list(), list()) -> 180 | ok | {error, term()}. 181 | 182 | send_file(Node, Path, RemotePath) -> 183 | Path1 = nklib_util:to_list(Path), 184 | case filelib:is_regular(Path1) of 185 | true -> 186 | case filelib:file_size(Path1) =< 4096 of 187 | true -> 188 | case file:read_file(Path1) of 189 | {ok, Bin} -> 190 | Opts = #{timeout=>30000}, 191 | Msg = {write_file, nklib_util:to_binary(RemotePath), Bin}, 192 | request2(Node, ?CLASS, Msg, Opts); 193 | {error, Error} -> 194 | {error, Error} 195 | end; 196 | false -> 197 | send_bigfile(Node, Path, RemotePath) 198 | end; 199 | false -> 200 | {error, invalid_file} 201 | end. 202 | 203 | 204 | %% @doc Sends a big file, in 1MB chunks. 205 | %% If the remote directory does not exist, it will be created. 206 | -spec send_bigfile(conn_spec(), list(), list()) -> 207 | ok | {error, term()}. 208 | 209 | send_bigfile(Node, Path, RemotePath) -> 210 | Path1 = nklib_util:to_list(Path), 211 | case filelib:is_regular(Path1) of 212 | true -> 213 | try 214 | Device = case file:open(Path1, [read, binary]) of 215 | {ok, Device0} -> Device0; 216 | {error, FileError} -> throw(FileError) 217 | end, 218 | ConnPid = case nkcluster_nodes:new_connection(Node) of 219 | {ok, ConnPid0} -> ConnPid0; 220 | {error, ConnError} -> throw(ConnError) 221 | end, 222 | Opts = #{conn_pid=>ConnPid, timeout=>30000}, 223 | Msg = {write_file, nklib_util:to_binary(RemotePath)}, 224 | case task(Node, ?CLASS, Msg, Opts) of 225 | {ok, TaskId} -> 226 | do_send_bigfile(Node, Device, TaskId, Opts); 227 | {error, CallError} -> 228 | {error, CallError} 229 | end 230 | catch 231 | throw:Throw -> {error, Throw} 232 | end; 233 | false -> 234 | {error, invalid_file} 235 | end. 236 | 237 | 238 | %% @private 239 | -spec do_send_bigfile(conn_spec(), file:fd(), task_id(), map()) -> 240 | {reply, ok} | {error, term()}. 
241 | 242 | do_send_bigfile(Node, Device, TaskId, Opts) -> 243 | case file:read(Device, 1024*1024) of 244 | {ok, Data} -> 245 | case command(Node, ?CLASS, TaskId, {write_file, Data}, Opts) of 246 | {reply, ok} -> 247 | do_send_bigfile(Node, Device, TaskId, Opts); 248 | {error, Error} -> 249 | file:close(Device), 250 | {error, Error} 251 | end; 252 | eof -> 253 | file:close(Device), 254 | case command(Node, ?CLASS, TaskId, {write_file, eof}, Opts) of 255 | {reply, ok} -> ok; 256 | {error, Error} -> {error, Error} 257 | end; 258 | {error, Error} -> 259 | file:close(Device), 260 | {error, Error} 261 | end. 262 | 263 | 264 | %% @doc Sends and loads a module to the remote side 265 | -spec load_module(conn_spec(), atom()) -> 266 | ok | {error, term()}. 267 | 268 | load_module(Node, Mod) -> 269 | case code:get_object_code(Mod) of 270 | {Mod, Bin, File} -> 271 | Opts = #{timeout=>30000}, 272 | request2(Node, ?CLASS, {load_code, [{Mod, File, Bin}]}, Opts); 273 | error -> 274 | {error, module_not_found} 275 | end. 276 | 277 | 278 | %% @doc Sends and loads all modules belonging to an application 279 | -spec load_modules(conn_spec(), atom()) -> 280 | ok | {error, term()}. 281 | 282 | load_modules(Node, App) -> 283 | application:load(App), 284 | case application:get_key(App, modules) of 285 | {ok, Modules} -> 286 | Data = lists:foldl( 287 | fun(Mod, Acc) -> 288 | case code:get_object_code(Mod) of 289 | {Mod, Bin, File} -> [{Mod, File, Bin}|Acc]; 290 | error -> Acc 291 | end 292 | end, 293 | [], 294 | Modules), 295 | case nkcluster_nodes:new_connection(Node) of 296 | {ok, ConnPid} -> 297 | Opts = #{timeout=>30000, conn_pid=>ConnPid}, 298 | request2(Node, ?CLASS, {load_code, Data}, Opts); 299 | {error, Error} -> 300 | {error, Error} 301 | end; 302 | undefined -> 303 | {error, app_not_found} 304 | end. 
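The user functions above compose into simple one-liners from a control node. The following is an illustrative sketch only, not part of the repository: it assumes a running NkCLUSTER control node, a connected worker bound to `NodeId`, and a hypothetical module `my_mod`.

```erlang
%% Hypothetical usage sketch: requires a running NkCLUSTER cluster.
example(NodeId) ->
    %% Small files are sent synchronously; larger ones fall back to
    %% send_bigfile/3, which streams 1MB chunks over a dedicated connection.
    ok = nkcluster:send_file(NodeId, "local/app.config", "/tmp/app.config"),
    %% Push the current version of a module before calling into it remotely
    ok = nkcluster:load_module(NodeId, my_mod),
    %% Plain remote call with a 5-second timeout; per call/5's spec, a
    %% successful result comes back as {reply, Term}
    {reply, Sum} = nkcluster:call(NodeId, lists, sum, [[1, 2, 3]], 5000),
    Sum.
```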
305 | 306 | 307 | %% =================================================================== 308 | %% Generic 309 | %% =================================================================== 310 | 311 | 312 | %% @doc Equivalent to request(Node, Class, Cmd, #{}) 313 | -spec request(conn_spec(), job_class(), request()) -> 314 | {reply, reply()} | {error, term()}. 315 | 316 | request(Node, Class, Cmd) -> 317 | request(Node, Class, Cmd, #{}). 318 | 319 | 320 | %% @doc Sends a remote synchronous call and waits for the result 321 | -spec request(conn_spec(), job_class(), request(), req_opts()) -> 322 | {reply, reply()} | {error, term()}. 323 | 324 | request(Node, Class, Cmd, Opts) -> 325 | nkcluster_nodes:rpc(Node, {req, Class, Cmd}, Opts). 326 | 327 | 328 | %% @doc Equivalent to request2(Node, Class, Cmd, #{}) 329 | -spec request2(conn_spec(), job_class(), request()) -> 330 | ok | {ok, reply()} | {error, term()}. 331 | 332 | request2(Node, Class, Cmd) -> 333 | request2(Node, Class, Cmd, #{}). 334 | 335 | 336 | %% @doc Sends a remote synchronous call and waits for the result 337 | -spec request2(conn_spec(), job_class(), request(), req_opts()) -> 338 | ok | {ok, reply()} | {error, term()}. 339 | 340 | request2(Node, Class, Cmd, Opts) -> 341 | case request(Node, Class, Cmd, Opts) of 342 | {reply, ok} -> ok; 343 | {reply, Reply} -> {ok, Reply}; 344 | {error, Error} -> {error, Error} 345 | end. 346 | 347 | 348 | %% @doc Equivalent to task(Node, Class, Cmd, #{}) 349 | -spec task(conn_spec(), job_class(), task()) -> 350 | {ok, task_id()} | {reply, reply(), task_id()} | 351 | {error, term()}. 352 | 353 | task(Node, Class, Spec) -> 354 | task(Node, Class, Spec, #{}). 355 | 356 | 357 | %% @doc Starts a new remote job. 358 | -spec task(conn_spec(), job_class(), task(), req_opts()) -> 359 | {ok, task_id()} | {reply, reply(), task_id()} | 360 | {error, term()}.
361 | 362 | task(Node, Class, Spec, Opts) -> 363 | TaskId = case Opts of 364 | #{task_id:=TaskId0} -> TaskId0; 365 | _ -> nklib_util:luid() 366 | end, 367 | case nkcluster_nodes:rpc(Node, {tsk, Class, TaskId, Spec}, Opts) of 368 | {reply, ok} -> 369 | {ok, TaskId}; 370 | {reply, {ok, Reply}} -> 371 | {reply, Reply, TaskId}; 372 | {error, Error} -> 373 | {error, Error} 374 | end. 375 | 376 | 377 | %% @doc Equivalent to command(Node, JobClass, TaskId, Cmd, #{}) 378 | -spec command(conn_spec(), job_class(), task_id(), command()) -> 379 | {ok, reply()} | {error, term()}. 380 | 381 | command(Node, JobClass, TaskId, Cmd) -> 382 | command(Node, JobClass, TaskId, Cmd, #{}). 383 | 384 | 385 | %% @doc Sends a message to a remote job 386 | -spec command(conn_spec(), job_class(), task_id(), command(), req_opts()) -> 387 | {reply, reply()} | {error, term()}. 388 | 389 | command(Node, JobClass, TaskId, Cmd, Opts) -> 390 | nkcluster_nodes:rpc(Node, {cmd, JobClass, TaskId, Cmd}, Opts). 391 | 392 | 393 | 394 | -------------------------------------------------------------------------------- /src/nkcluster_agent.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 
18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER Agent Management 22 | %% 23 | %% For primary nodes: 24 | %% - it tries to connect to other Erlang nodes (pinging them) 25 | %% 26 | %% For all: 27 | %% - gets periodic statistics, and tries to send an update for each one 28 | %% - if it cannot (because of no primary connection) it will try to 29 | %% send an announce. The announce will reach nkcluster_nodes, and will 30 | %% start a new proxy if none is up 31 | %% - if it cannot (because of no connection) it will try to start a connection 32 | %% - waits for pings from the control side; on timeout it will try to connect 33 | %% 34 | -module(nkcluster_agent). 35 | -author('Carlos Gonzalez '). 36 | -behaviour(gen_server). 37 | 38 | -export([node_id/0, is_primary/0]). 39 | -export([get_status/0, set_status/1, update_cluster_addr/2, connect/2]). 40 | -export([connect_opts/3, ping_all_nodes/0, join_nodes/1, received_ping/0]). 41 | -export([clear_cluster_addr/0]). 42 | -export([start_link/0, init/1, terminate/2, code_change/3, handle_call/3, 43 | handle_cast/2, handle_info/2]). 44 | 45 | -include_lib("nklib/include/nklib.hrl"). 46 | -include_lib("nkpacket/include/nkpacket.hrl"). 47 | 48 | 49 | -type connect_opts() :: 50 | #{ 51 | password => binary(), 52 | ?TLS_TYPES 53 | }. 54 | 55 | 56 | %% =================================================================== 57 | %% Public 58 | %% =================================================================== 59 | 60 | 61 | %% @doc Gets the current node ID 62 | -spec node_id() -> 63 | nkcluster:node_id(). 64 | 65 | node_id() -> 66 | nkcluster_app:get(node_id). 67 | 68 | 69 | %% @doc Checks whether this node is a primary node 70 | -spec is_primary() -> 71 | boolean(). 72 | 73 | is_primary() -> 74 | nkcluster_app:get(type) == primary. 75 | 76 | 77 | %% @doc Gets the current node status 78 | -spec get_status() -> 79 | nkcluster:node_status().
80 | 81 | get_status() -> 82 | [{Status, _}|_] = nklib_proc:values(?MODULE), 83 | Status. 84 | 85 | 86 | %% @doc Sets the current node status 87 | -spec set_status(ready|standby|stopped) -> 88 | ok | {error, term()}. 89 | 90 | set_status(Status) when Status==ready; Status==standby; Status==stopped -> 91 | gen_server:call(?MODULE, {set_status, Status}). 92 | 93 | 94 | %% @doc Updates the announce list 95 | -spec update_cluster_addr(boolean(), [nklib:user_uri()]) -> 96 | ok | {error, term()}. 97 | 98 | update_cluster_addr(IsPreferred, ClusterAddr) -> 99 | case resolve(ClusterAddr, #{}) of 100 | {ok, Conns} -> 101 | gen_server:cast(?MODULE, {update_addrs, IsPreferred, Conns}); 102 | {error, Error} -> 103 | {error, Error} 104 | end. 105 | 106 | 107 | %% @private Prevents further reconnection attempts 108 | -spec clear_cluster_addr() -> 109 | ok. 110 | 111 | clear_cluster_addr() -> 112 | gen_server:cast(?MODULE, {update_addrs, true, []}), 113 | gen_server:cast(?MODULE, {update_addrs, false, []}). 114 | 115 | 116 | %% @private Connects to a remote node and gets its info 117 | -spec connect(nklib:user_uri(), connect_opts()) -> 118 | {ok, nkcluster:node_id(), map()} | {error, term()}. 119 | 120 | connect(NodeAddrs, Opts) -> 121 | case resolve(NodeAddrs, Opts) of 122 | {ok, Conns} -> 123 | case do_connect(control, self(), Conns) of 124 | {ok, Pid, NodeId, Info} -> 125 | {ok, NodeId, Info#{conn_pid=>Pid}}; 126 | {error, Error} -> 127 | {error, Error} 128 | end; 129 | {error, Error} -> 130 | {error, Error} 131 | end. 132 | 133 | 134 | %% @private Pings all known nodes in the cluster_addr (primary nodes, to join them) 135 | %% When we connect to a primary node, it will send its list of known primary nodes, 136 | %% and the protocol will call join_nodes/1 137 | %% The local password is used for all of them, unless each URL includes its own 138 | -spec ping_all_nodes() -> 139 | ok | {error, term()}.
140 | 141 | ping_all_nodes() -> 142 | ClusterAddr = nkcluster_app:get(cluster_addr), 143 | case resolve(ClusterAddr, #{}) of 144 | {ok, Conns} -> 145 | lists:foreach( 146 | fun(Conn) -> 147 | case do_connect(control, self(), [Conn]) of 148 | {ok, Pid, NodeId, _Info} -> 149 | lager:info("NkCLUSTER agent pinged ~s", [NodeId]), 150 | nkcluster_protocol:stop(Pid); 151 | {error, Error} -> 152 | lager:info("NkCLUSTER agent could not ping ~p: ~p", 153 | [Conn, Error]) 154 | end 155 | end, 156 | Conns); 157 | {error, Error} -> 158 | {error, Error} 159 | end. 160 | 161 | 162 | %% @private Called from nkcluster_protocol when it is connected to a remote 163 | %% primary node and it returns its list of known primary nodes 164 | -spec join_nodes([node()]) -> 165 | ok. 166 | 167 | join_nodes(Nodes) -> 168 | case is_primary() of 169 | true -> 170 | Fun = case nkcluster_app:get(staged_joins) of 171 | true -> staged_join; 172 | false -> join 173 | end, 174 | lists:foreach( 175 | fun(Node) -> 176 | case apply(riak_core, Fun, [Node]) of 177 | ok -> 178 | lager:notice("NkCLUSTER Primary Node joined ~p", [Node]); 179 | {error, Error} -> 180 | lager:notice("NkCLUSTER Primary Node could not join ~p: ~p", 181 | [Node, Error]) 182 | end 183 | end, 184 | Nodes -- [node()|nodes()]); 185 | false -> 186 | ok 187 | end. 188 | 189 | 190 | %% @private Called from nkcluster_protocol when we receive a ping 191 | %% We check that another one arrives soon after 192 | received_ping() -> 193 | gen_server:cast(?MODULE, reset_ping_timer). 194 | 195 | 196 | 197 | %% =================================================================== 198 | %% gen_server 199 | %% =================================================================== 200 | 201 | 202 | %% @private 203 | start_link() -> 204 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
205 | 206 | 207 | -record(state, { 208 | cluster_addrs = [] :: [{[nkpacket:raw_connection()], map()}], 209 | pref_addrs = [] :: [{[nkpacket:raw_connection()], map()}], 210 | listen = [] :: [nklib:uri()], 211 | status :: nkcluster:node_status(), 212 | os_type :: term(), 213 | connecting1 :: boolean(), 214 | stats = #{} :: map(), 215 | ping_timer :: reference() 216 | }). 217 | 218 | 219 | %% @private 220 | -spec init(term()) -> 221 | {ok, #state{}}. 222 | 223 | init([]) -> 224 | ok = update_cluster_addr(false, nkcluster_app:get(cluster_addr)), 225 | OsType = case os:type() of 226 | {unix, Type} -> Type; 227 | _ -> unknown 228 | end, 229 | State = #state{ 230 | listen = nkcluster_app:get(listen), 231 | status = ready, 232 | os_type = OsType, 233 | connecting1 = false 234 | }, 235 | nklib_proc:put(?MODULE, ready), 236 | case is_primary() of 237 | true -> spawn(fun() -> ping_all_nodes() end); 238 | false -> ok 239 | end, 240 | erlang:send_after(1000, self(), get_stats), 241 | {ok, reset_ping_timer(State)}. 242 | 243 | %% @private 244 | -spec handle_call(term(), {pid(), term()}, #state{}) -> 245 | {reply, term(), #state{}} | {noreply, #state{}}. 246 | 247 | handle_call(get_status, _From, #state{status=Status}=State) -> 248 | {reply, {ok, Status}, State}; 249 | 250 | handle_call({set_status, Status}, _From, #state{status=Status}=State) -> 251 | {reply, ok, State}; 252 | 253 | handle_call({set_status, New}, From, #state{status=Old}=State) -> 254 | case New of 255 | ready when Old==standby; Old==stopping; Old==stopped -> 256 | set_updated_status(ready, From, State); 257 | standby when Old==ready; Old==stopping; Old==stopped -> 258 | set_updated_status(standby, From, State); 259 | stopped when Old==ready; Old==standby; Old==stopping -> 260 | self() ! 
check_stopped, 261 | set_updated_status(stopping, From, State); 262 | _ -> 263 | {reply, {error, not_allowed}, State} 264 | end; 265 | 266 | handle_call(get_state, _From, State) -> 267 | {reply, State, State}; 268 | 269 | handle_call(Msg, _From, State) -> 270 | lager:error("Module ~p received unexpected call: ~p", [?MODULE, Msg]), 271 | {noreply, State}. 272 | 273 | 274 | %% @private 275 | -spec handle_cast(term(), #state{}) -> 276 | {noreply, #state{}}. 277 | 278 | handle_cast({update_addrs, true, Addrs}, State) -> 279 | {noreply, State#state{pref_addrs=Addrs}}; 280 | 281 | handle_cast({update_addrs, false, Addrs}, State) -> 282 | {noreply, State#state{cluster_addrs=Addrs}}; 283 | 284 | handle_cast({send_update, Stats}, State) -> 285 | State1 = State#state{stats=Stats}, 286 | HasNoAddrs = has_no_addrs(State), 287 | State2 = case send_update(State1) of 288 | ok -> 289 | State1; 290 | {error, _} when HasNoAddrs -> 291 | State1; 292 | {error, _} -> 293 | % Send announce to all connected primaries 294 | case nkcluster_protocol:send_announce() of 295 | ok -> 296 | lager:info("NkCLUSTER agent sent announcement", []), 297 | State1; 298 | error -> 299 | connect_and_announce(State) 300 | end 301 | end, 302 | Time = nkcluster_app:get(stats_time), 303 | erlang:send_after(Time, self(), get_stats), 304 | {noreply, State2}; 305 | 306 | handle_cast({connecting, false}, State) -> 307 | {noreply, State#state{connecting1=false}}; 308 | 309 | handle_cast(reset_ping_timer, State) -> 310 | {noreply, reset_ping_timer(State)}; 311 | 312 | handle_cast(Msg, State) -> 313 | lager:error("Module ~p received unexpected cast: ~p", [?MODULE, Msg]), 314 | {noreply, State}. 315 | 316 | 317 | %% @private 318 | -spec handle_info(term(), #state{}) -> 319 | {noreply, #state{}}. 
320 | 321 | handle_info(get_stats, State) -> 322 | spawn(fun() -> get_stats() end), 323 | {noreply, State}; 324 | 325 | handle_info(check_stopped, #state{status=stopping}=State) -> 326 | case nkcluster_jobs:get_tasks() of 327 | {ok, []} -> 328 | set_updated_status(stopped, none, State); 329 | _ -> 330 | erlang:send_after(1000, self(), check_stopped), 331 | {noreply, State} 332 | end; 333 | 334 | handle_info(check_stopped, State) -> 335 | {noreply, State}; 336 | 337 | handle_info(ping_timeout, #state{connecting1=false}=State) -> 338 | State1 = case has_no_addrs(State) of 339 | true -> 340 | State; 341 | false -> 342 | lager:notice("NkCLUSTER agent ping timeout!", []), 343 | connect_and_announce(State) 344 | end, 345 | {noreply, reset_ping_timer(State1)}; 346 | 347 | handle_info(ping_timeout, #state{connecting1=true}=State) -> 348 | {noreply, reset_ping_timer(State)}; 349 | 350 | handle_info(Msg, State) -> 351 | lager:error("Module ~p received unexpected info: ~p", [?MODULE, Msg]), 352 | {noreply, State}. 353 | 354 | 355 | %% @private 356 | -spec code_change(term(), #state{}, term()) -> 357 | {ok, #state{}}. 358 | 359 | code_change(_OldVsn, State, _Extra) -> 360 | {ok, State}. 361 | 362 | 363 | %% @private 364 | -spec terminate(term(), #state{}) -> 365 | ok. 366 | 367 | terminate(_Reason, _State) -> 368 | ok.
369 | 370 | 371 | 372 | %% =================================================================== 373 | %% Internal 374 | %% =================================================================== 375 | 376 | 377 | %% @private 378 | get_stats() -> 379 | Stats = #{ 380 | cpu => 381 | #{ 382 | nprocs => cpu_sup:nprocs(), 383 | avg1 => cpu_sup:avg1() / 256, 384 | avg5 => cpu_sup:avg5() / 256, 385 | avg15 => cpu_sup:avg15() / 256 386 | }, 387 | memory => 388 | maps:from_list(memsup:get_system_memory_data()), 389 | disks => 390 | [ 391 | #{path=>Path, size=>Size div 1024, user=>Use} 392 | || {Path, Size, Use} <- disksup:get_disk_data() 393 | ], 394 | time => 395 | nklib_util:l_timestamp() 396 | }, 397 | gen_server:cast(?MODULE, {send_update, Stats}). 398 | 399 | 400 | %% @private 401 | -spec send_update(#state{}) -> 402 | ok | {error, term()}. 403 | 404 | send_update(#state{stats=Stats, listen=Listen, status=Status, os_type=OsType}) -> 405 | Update = #{ 406 | status => Status, 407 | listen => Listen, 408 | meta => nkcluster_app:get(meta), 409 | stats => Stats#{os_type=>OsType} 410 | }, 411 | nkcluster_protocol:send_event(nkcluster, {agent_update, Update}). 412 | 413 | 414 | %% @private 415 | connect_and_announce(#state{connecting1=false}=State) -> 416 | #state{pref_addrs=Pref, cluster_addrs=Cluster} = State, 417 | Addrs = Pref ++ nklib_util:randomize(Cluster), 418 | Self = self(), 419 | Fun = fun() -> 420 | case do_connect(worker, Self, Addrs) of 421 | {ok, Pid, _NodeId, _Info} -> 422 | nkcluster_protocol:send_announce([Pid]); 423 | {error, Error} -> 424 | lager:info("NkCLUSTER agent could not connect to any control node: ~p", 425 | [Error]) 426 | end, 427 | gen_server:cast(Self, {connecting, false}) 428 | end, 429 | spawn_link(Fun), 430 | State#state{connecting1=true}; 431 | 432 | connect_and_announce(State) -> 433 | State. 
434 | 435 | 436 | %% @private 437 | do_connect(_Type, _Host, []) -> 438 | {error, no_connections}; 439 | 440 | do_connect(Type, Host, [{Conns, Opts}|Rest]) -> 441 | ConnOpts = connect_opts(Type, Host, Opts), 442 | lager:info("NkCLUSTER agent connecting to ~p (~p)", [Conns, Type]), 443 | case catch nkpacket:connect(Conns, ConnOpts) of 444 | {ok, Pid} -> 445 | case nkcluster_protocol:wait_auth(Pid) of 446 | {ok, NodeId, #{remote:=Remote}=Info} -> 447 | lager:info("NkCLUSTER agent connected to ~s", [Remote]), 448 | {ok, Pid, NodeId, Info}; 449 | {error, _} -> 450 | do_connect(Type, Host, Rest) 451 | end; 452 | {error, Error} -> 453 | lager:info("NkCLUSTER agent could not connect to ~p: ~p", [Conns, Error]), 454 | do_connect(Type, Host, Rest); 455 | {'EXIT', Error} -> 456 | lager:info("NkCLUSTER agent could not connect to ~p: ~p", 457 | [Conns, {exit, Error}]), 458 | do_connect(Type, Host, Rest) 459 | end. 460 | 461 | 462 | %% @private 463 | set_updated_status(Status, From, State) -> 464 | nklib_proc:put(?MODULE, Status), 465 | case From of 466 | none -> ok; 467 | _ -> gen_server:reply(From, ok) 468 | end, 469 | nkcluster_jobs:updated_status(Status), 470 | nkcluster_protocol:send_event(nkcluster, {node_status, Status}), 471 | {noreply, State#state{status=Status}}. 472 | 473 | 474 | %% @private 475 | -spec connect_opts(worker|{control, pid()}, pid(), map()) -> 476 | map(). 477 | 478 | connect_opts(Type, Host, Opts) -> 479 | TLSKeys = nkpacket_util:tls_keys(), 480 | TLSOpts = maps:with([host|TLSKeys], Opts), 481 | TLSOpts#{ 482 | srv_id => nkcluster, 483 | valid_schemes => [nkcluster], 484 | monitor => Host, 485 | idle_timeout => 15000, 486 | ws_proto => nkcluster, 487 | tcp_packet => 4, 488 | user => maps:with([password, type], Opts#{type=>Type}) 489 | }. 490 | 491 | 492 | %% @private 493 | has_no_addrs(#state{pref_addrs=Pref, cluster_addrs=Cluster}) -> 494 | Pref==[] andalso Cluster==[]. 
495 | 496 | 497 | %% @private 498 | reset_ping_timer(#state{ping_timer=Timer}=State) -> 499 | nklib_util:cancel_timer(Timer), 500 | Time = 2 * nkcluster_app:get(ping_time), 501 | State#state{ping_timer=erlang:send_after(Time, self(), ping_timeout)}. 502 | 503 | -compile(export_all). 504 | 505 | %% @private 506 | resolve(Uri, Opts) -> 507 | Syntax = #{password=>binary}, 508 | Opts1 = Opts#{valid_schemes=>[nkcluster], syntax=>Syntax}, 509 | nkpacket:multi_resolve(Uri, Opts1). 510 | 511 | 512 | -------------------------------------------------------------------------------- /src/nkcluster_app.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER OTP Application Module 22 | -module(nkcluster_app). 23 | -author('Carlos Gonzalez '). 24 | -behaviour(application). 25 | 26 | -export([start/0, start/1, start/2, stop/1]). 27 | -export([get/1, put/2, del/1]). 28 | 29 | -include("nkcluster.hrl"). 30 | -include_lib("nklib/include/nklib.hrl"). 31 | 32 | -define(APP, nkcluster). 33 | 34 | -compile({no_auto_import, [get/1, put/2]}). 
35 | 36 | %% =================================================================== 37 | %% Private 38 | %% =================================================================== 39 | 40 | %% @doc Starts NkCLUSTER stand alone. 41 | -spec start() -> 42 | ok | {error, Reason::term()}. 43 | 44 | start() -> 45 | start(temporary). 46 | 47 | 48 | %% @doc Starts NkCLUSTER stand alone. 49 | -spec start(permanent|transient|temporary) -> 50 | ok | {error, Reason::term()}. 51 | 52 | start(Type) -> 53 | nkdist_util:ensure_dir(), 54 | case nklib_util:ensure_all_started(?APP, Type) of 55 | {ok, _Started} -> 56 | ok; 57 | Error -> 58 | Error 59 | end. 60 | 61 | %% @private OTP standard start callback 62 | start(_Type, _Args) -> 63 | Syntax = nkcluster_syntax:app_syntax(), 64 | Defaults = nkcluster_syntax:app_defaults(), 65 | case nklib_config:load_env(?APP, Syntax, Defaults) of 66 | {ok, Opts} -> 67 | TLSKeys = nkpacket_util:tls_keys(), 68 | TLSOpts = maps:with(TLSKeys, nklib_util:to_map(Opts)), 69 | put(tls_opts, TLSOpts), 70 | nkpacket:register_protocol(nkcluster, nkcluster_protocol), 71 | check_uris(get(cluster_addr)), 72 | check_uris(get(listen)), 73 | %% It is NOT recommended that you fix the NodeId! 
74 | NodeId = case get(node_id) of 75 | Bin when is_binary(Bin), byte_size(Bin) > 0 -> Bin; 76 | _ -> nklib_util:luid() 77 | end, 78 | nklib_config:put(?APP, node_id, NodeId), 79 | {ok, Vsn} = application:get_key(?APP, vsn), 80 | case get(type) of 81 | primary -> ok = nkdist_app:start(); 82 | secondary -> ok 83 | end, 84 | lager:notice("NkCLUSTER v~s node ~s is starting (cluster '~s', ~p)", 85 | [Vsn, NodeId, get(cluster_name), get(type)]), 86 | lager:notice("NkCLUSTER listening on ~s", 87 | [nklib_unparse:uri(get(listen))]), 88 | case nklib_unparse:uri(get(cluster_addr)) of 89 | <<>> -> 90 | ok; 91 | Addrs -> 92 | lager:notice("NkCLUSTER cluster ~s addresses: ~s", 93 | [get(cluster_name), Addrs]) 94 | end, 95 | case nklib_unparse:token(get(meta)) of 96 | <<>> -> ok; 97 | Meta -> lager:notice("NkCLUSTER metadata: ~s", [Meta]) 98 | end, 99 | nkcluster_sup:start_link(); 100 | {error, Error} -> 101 | lager:error("Error parsing config: ~p", [Error]), 102 | error(Error) 103 | end. 104 | 105 | 106 | %% @private OTP standard stop callback 107 | stop(_) -> 108 | ok. 109 | 110 | 111 | %% @doc gets a configuration value 112 | get(Key) -> 113 | get(Key, undefined). 114 | 115 | 116 | %% @doc gets a configuration value 117 | get(Key, Default) -> 118 | nklib_config:get(?APP, Key, Default). 119 | 120 | 121 | %% @doc updates a configuration value 122 | put(Key, Value) -> 123 | nklib_config:put(?APP, Key, Value). 124 | 125 | 126 | %% @doc updates a configuration value 127 | del(Key) -> 128 | nklib_config:del(?APP, Key). 129 | 130 | 131 | %% @private 132 | check_uris(Uris) -> 133 | case nkpacket:multi_resolve(Uris, #{valid_schemes=>[nkcluster]}) of 134 | {ok, _} -> 135 | ok; 136 | {error, _} -> 137 | lager:error("Error parsing config: invalid_uri ~p", [Uris]), 138 | error(invalid_uri) 139 | end. 
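The keys read through `get/1` in the start callback above (`type`, `node_id`, `cluster_name`, `cluster_addr`, `listen`, `meta`) come from the `nkcluster` application environment, parsed against `nkcluster_syntax:app_syntax()`. A hypothetical `sys.config` fragment is sketched below; the key names are taken from the code above, but the values (and the exact accepted formats, which `util/dev1.config` and friends show authoritatively) are illustrative only.

```erlang
%% Illustrative config sketch: values are invented, not repo defaults.
{nkcluster, [
    {type, primary},                               % primary | secondary
    {cluster_name, "my_cluster"},
    {cluster_addr, "nkcluster://10.0.0.1:15001"},  % checked by check_uris/1
    {listen, "nkcluster://all:15001"},             % checked by check_uris/1
    {meta, "datacenter=eu;rack=r1"}                % parsed as nklib tokens
    %% node_id is normally left unset so a random LUID is generated
]}
```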
140 | 141 | 142 | -------------------------------------------------------------------------------- /src/nkcluster_job_class.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2014 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc Job Class Behaviour 22 | -module(nkcluster_job_class). 23 | -author('Carlos Gonzalez '). 24 | 25 | -type from() :: nkcluster_protocol:from(). 26 | 27 | 28 | %% =================================================================== 29 | %% Callbacks 30 | %% =================================================================== 31 | 32 | %% @doc This callback is called at the worker side, when a new request has arrived. 33 | %% You must supply an immediate response. 34 | %% If you are going to spend more than a few milliseconds and don't want to block 35 | %% the network channel, you can return 'defer' and call nkcluster_jobs:reply/2 at a 36 | %% later time. However, you should consider using a task instead. 37 | %% You should only use the {error, term()} form for class-wide errors, not for 38 | %% problems specific to this particular request (use {reply, {error, ...}} or similar).
39 | -callback request(nkcluster:request(), from()) -> 40 | {reply, nkcluster:reply()} | {error, term()} | defer. 41 | 42 | 43 | %% @doc This callback is called at the worker side, when a new task is requested to start. 44 | %% You should start a new Erlang process and return its pid(), and, optionally, 45 | %% a reply to send back to the caller. 46 | %% You must return {error, term()} if you couldn't start the process. 47 | %% NkCLUSTER will send an event {nkcluster, {task_started, TaskId}} to the class. 48 | %% When the process ends, an event {nkcluster, {task_stopped, TaskId, Reason}} 49 | %% will be sent. 50 | -callback task(nkcluster:task_id(), nkcluster:task()) -> 51 | {ok, pid()} | {ok, nkcluster:reply(), pid()} | {error, term()}. 52 | 53 | 54 | %% @doc This callback is called at the worker side, when a command is sent to a 55 | %% currently running task. The pid() of the task is included. 56 | %% You must supply an immediate response. 57 | %% If you are going to spend more than a few milliseconds and don't want to block 58 | %% the network channel, you can return 'defer' and call nkcluster_jobs:reply/2 at a 59 | %% later time. However, you should consider using events instead. 60 | %% You should only use the {error, term()} form for class-wide errors, not for 61 | %% problems specific to this request (use {reply, {error, ...}} or similar). 62 | -callback command(pid(), nkcluster:command(), from()) -> 63 | {reply, nkcluster:reply()} | {error, term()} | defer. 64 | 65 | 66 | %% @doc This callback is called at the worker side, for each started task, 67 | %% when the node status changes. 68 | %% If the status changes to stopping, the task should stop as soon as possible, 69 | %% in order to shut down the node. 70 | -callback status(pid(), nkcluster:node_status()) -> 71 | ok. 72 | 73 | 74 | %% @doc This callback is called at the control side, when an event is sent from 75 | %% a task or request at the worker side.
76 | %% NkCLUSTER will also send the following events: 77 | %% 78 | %% - {nkcluster, {task_started, TaskId}} 79 | %% Called when a new task has started at the remote node 80 | %% 81 | %% - {nkcluster, {task_stopped, TaskId, Reason}} 82 | %% Called when a task has stopped at the remote node 83 | %% 84 | %% - {nkcluster, connection_error} 85 | %% It means that the communication channel has failed, and some events could 86 | %% have been lost. You should try to recover the state from the worker node 87 | %% by sending requests to it. 88 | %% 89 | -callback event(nkcluster:node_id(), term()) -> 90 | ok. 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/nkcluster_jobs.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER Job Management 22 | %% This server manages all worker tasks 23 | -module(nkcluster_jobs). 24 | -author('Carlos Gonzalez '). 25 | -behaviour(gen_server). 26 | 27 | -export([request/3, task/4, command/4, reply/2, send_event/2, get_tasks/0, get_tasks/1]).
28 | -export([get_task/2, updated_status/1]). 29 | -export([start_link/0, init/1, terminate/2, code_change/3, handle_call/3, 30 | handle_cast/2, handle_info/2]). 31 | 32 | 33 | %% =================================================================== 34 | %% Public 35 | %% =================================================================== 36 | 37 | %% @private 38 | -spec request(nkcluster:job_class(), nkcluster:request(), nkcluster_protocol:from()) -> 39 | {reply, term()} | {error, term()} | defer. 40 | 41 | request(Class, Spec, From) -> 42 | lager:debug("New REQ '~p': ~p", [Class, Spec]), 43 | try Class:request(Spec, From) of 44 | {reply, Reply} -> 45 | {reply, Reply}; 46 | {error, Error} -> 47 | {error, Error}; 48 | defer -> 49 | defer 50 | catch 51 | C:E-> 52 | {error, {{C, E}, erlang:get_stacktrace()}} 53 | end. 54 | 55 | 56 | %% @private 57 | -spec task(nkcluster:job_class(), nkcluster:task_id(), nkcluster:task(), 58 | nkcluster_protocol:from()) -> 59 | {reply, term()} | {error, term()}. 60 | 61 | task(Class, TaskId, Spec, _From) -> 62 | lager:debug("New TASK '~p' ~s: ~p", [Class, TaskId, Spec]), 63 | case nkcluster_agent:get_status() of 64 | ready -> 65 | case get_task(Class, TaskId) of 66 | not_found -> 67 | try Class:task(TaskId, Spec) of 68 | {ok, Pid} when is_pid(Pid) -> 69 | task_started(Class, TaskId, Pid), 70 | {reply, ok}; 71 | {ok, Reply, Pid} when is_pid(Pid) -> 72 | task_started(Class, TaskId, Pid), 73 | {reply, {ok, Reply}}; 74 | {error, Error} -> 75 | {error, Error} 76 | catch 77 | C:E-> 78 | {error, {{C, E}, erlang:get_stacktrace()}} 79 | end; 80 | {ok, _Pid} -> 81 | {error, already_started} 82 | end; 83 | Status -> 84 | {error, {node_not_ready, Status}} 85 | end. 86 | 87 | 88 | %% @private 89 | -spec command(nkcluster:job_class(), nkcluster:task_id(), term(), 90 | nkcluster_protocol:from()) -> 91 | {reply, term()} | {error, term()} | defer. 
92 | 93 | command(Class, TaskId, Spec, From) -> 94 | case get_task(Class, TaskId) of 95 | {ok, Pid} -> 96 | lager:debug("New Cmd '~p' ~s: ~p (~p)", [Class, TaskId, Spec, Pid]), 97 | try Class:command(Pid, Spec, From) of 98 | {reply, Reply} -> 99 | {reply, Reply}; 100 | {error, Error} -> 101 | {error, Error}; 102 | defer -> 103 | defer 104 | catch 105 | C:E -> 106 | {error, {{C, E}, erlang:get_stacktrace()}} 107 | end; 108 | not_found -> 109 | {error, task_not_found} 110 | end. 111 | 112 | 113 | %% @private 114 | -spec get_task(nkcluster:job_class(), nkcluster:task_id()) -> 115 | {ok, pid()} | not_found. 116 | 117 | get_task(Class, TaskId) -> 118 | gen_server:call(?MODULE, {get_task, Class, TaskId}). 119 | 120 | 121 | %% @private 122 | -spec task_started(nkcluster:job_class(), nkcluster:task_id(), pid()) -> 123 | ok. 124 | 125 | task_started(Class, TaskId, Pid) -> 126 | ok = gen_server:call(?MODULE, {task_started, Class, TaskId, Pid}). 127 | 128 | 129 | %% @doc 130 | -spec get_tasks() -> 131 | {ok, [{nkcluster:job_class(), nkcluster:task_id()}]}. 132 | 133 | get_tasks() -> 134 | gen_server:call(?MODULE, get_tasks). 135 | 136 | 137 | -spec get_tasks(nkcluster:job_class()) -> 138 | {ok, [nkcluster:task_id()]}. 139 | 140 | get_tasks(Class) -> 141 | gen_server:call(?MODULE, {get_tasks, Class}). 142 | 143 | 144 | %% @doc 145 | -spec reply(nkcluster_protocol:from(), {reply, nkcluster:reply()} | {error, term()}) -> 146 | ok | {error, term()}. 147 | 148 | reply({ConnId, TransId}, {Class, Reply}) when Class==reply; Class==error -> 149 | nkcluster_protocol:send_reply(ConnId, TransId, {Class, Reply}). 150 | 151 | 152 | %% @doc 153 | -spec send_event(nkcluster:job_class(), nkcluster:event()) -> 154 | ok | {error, term()}. 155 | 156 | send_event(Class, Event) -> 157 | nkcluster_protocol:send_event(Class, Event). 158 | 159 | 160 | %% @private 161 | -spec updated_status(nkcluster:node_status()) -> 162 | ok.
163 | 164 | updated_status(Status) -> 165 | gen_server:cast(?MODULE, {updated_status, Status}). 166 | 167 | 168 | 169 | %% =================================================================== 170 | %% gen_server 171 | %% =================================================================== 172 | 173 | 174 | %% @private 175 | start_link() -> 176 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 177 | 178 | 179 | -record(state, { 180 | tasks :: #{{nkcluster:job_class(), nkcluster:task_id()} => pid()}, 181 | pids :: #{pid() => {nkcluster:job_class(), nkcluster:task_id()}} 182 | }). 183 | 184 | 185 | %% @private 186 | -spec init(term()) -> 187 | {ok, #state{}}. 188 | 189 | init([]) -> 190 | process_flag(trap_exit, true), 191 | State = #state{tasks=#{}, pids=#{}}, 192 | {ok, State}. 193 | 194 | %% @private 195 | -spec handle_call(term(), {pid(), term()}, #state{}) -> 196 | {reply, term(), #state{}} | {noreply, #state{}}. 197 | 198 | handle_call(get_tasks, _From, #state{tasks=Tasks}=State) -> 199 | {reply, {ok, maps:keys(Tasks)}, State}; 200 | 201 | handle_call({get_tasks, Class}, _From, #state{tasks=Tasks}=State) -> 202 | Data = [TaskId || {C, TaskId} <- maps:keys(Tasks), C==Class], 203 | {reply, {ok, Data}, State}; 204 | 205 | handle_call({get_task, Class, TaskId}, _From, #state{tasks=Tasks} = State) -> 206 | case maps:get({Class, TaskId}, Tasks, undefined) of 207 | Pid when is_pid(Pid) -> 208 | {reply, {ok, Pid}, State}; 209 | undefined -> 210 | {reply, not_found, State} 211 | end; 212 | 213 | handle_call({task_started, Class, TaskId, Pid}, From, State) -> 214 | #state{tasks=Tasks, pids=Pids} = State, 215 | gen_server:reply(From, ok), 216 | case maps:is_key({Class, TaskId}, Tasks) of 217 | true -> 218 | lager:warning("Started duplicated job '~p' ~s", [Class, TaskId]); 219 | false -> 220 | ok 221 | end, 222 | monitor(process, Pid), 223 | Tasks1 = maps:put({Class, TaskId}, Pid, Tasks), 224 | Pids1 = maps:put(Pid, {Class, TaskId}, Pids), 225 | case
send_event(Class, {nkcluster, {task_started, TaskId}}) of 226 | ok -> 227 | ok; 228 | {error, Error} -> 229 | lager:notice("NkCLUSTER Jobs could not send event: ~p", [Error]) 230 | end, 231 | {noreply, State#state{tasks=Tasks1, pids=Pids1}}; 232 | 233 | handle_call(get_state, _From, State) -> 234 | {reply, State, State}; 235 | 236 | handle_call(Msg, _From, State) -> 237 | lager:error("Module ~p received unexpected call: ~p", [?MODULE, Msg]), 238 | {noreply, State}. 239 | 240 | 241 | %% @private 242 | -spec handle_cast(term(), #state{}) -> 243 | {noreply, #state{}}. 244 | 245 | handle_cast({updated_status, Status}, #state{tasks=Tasks}=State) -> 246 | Data = [{Class, Pid} || {{Class, _TaskId}, Pid} <- maps:to_list(Tasks)], 247 | spawn_link(fun() -> send_updated_status(Status, Data) end), 248 | {noreply, State}; 249 | 250 | 251 | handle_cast(Msg, State) -> 252 | lager:error("Module ~p received unexpected cast: ~p", [?MODULE, Msg]), 253 | {noreply, State}. 254 | 255 | 256 | %% @private 257 | -spec handle_info(term(), #state{}) -> 258 | {noreply, #state{}}.
259 | 260 | handle_info({'DOWN', _Ref, process, Pid, Reason}=Msg, State) -> 261 | #state{tasks=Tasks, pids=Pids} = State, 262 | case maps:get(Pid, Pids, undefined) of 263 | {Class, TaskId} -> 264 | Tasks1 = maps:remove({Class, TaskId}, Tasks), 265 | Pids1 = maps:remove(Pid, Pids), 266 | case send_event(Class, {nkcluster, {task_stopped, TaskId, Reason}}) of 267 | ok -> 268 | ok; 269 | {error, Error} -> 270 | lager:notice("NkCLUSTER Jobs could not send event: ~p", [Error]) 271 | end, 272 | {noreply, State#state{tasks=Tasks1, pids=Pids1}}; 273 | undefined -> 274 | lager:error("Module ~p received unexpected info: ~p", [?MODULE, Msg]), 275 | {noreply, State} 276 | end; 277 | 278 | handle_info({'EXIT', Pid, _Reason}, #state{pids=Pids}=State) -> 279 | case maps:is_key(Pid, Pids) of 280 | true -> 281 | ok; 282 | false -> 283 | lager:info("Module ~p received unexpected EXIT: ~p", [?MODULE, Pid]) 284 | end, 285 | {noreply, State}; 286 | 287 | handle_info(Msg, State) -> 288 | lager:error("Module ~p received unexpected info: ~p", [?MODULE, Msg]), 289 | {noreply, State}. 290 | 291 | 292 | %% @private 293 | -spec code_change(term(), #state{}, term()) -> 294 | {ok, #state{}}. 295 | 296 | code_change(_OldVsn, State, _Extra) -> 297 | {ok, State}. 298 | 299 | 300 | %% @private 301 | -spec terminate(term(), #state{}) -> 302 | ok. 303 | 304 | terminate(_Reason, _State) -> 305 | ok. 306 | 307 | 308 | 309 | %% =================================================================== 310 | %% Internal 311 | %% =================================================================== 312 | 313 | 314 | %% @private 315 | send_updated_status(_Status, []) -> 316 | ok; 317 | 318 | send_updated_status(Status, [{Class, Pid}|Rest]) -> 319 | catch Class:status(Pid, Status), 320 | send_updated_status(Status, Rest).
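%% Example (illustrative sketch, not part of this module): a minimal job class
%% that this server could dispatch to. The module name 'my_job_class' and the
%% request/task terms used here are hypothetical.
%%
%%   -module(my_job_class).
%%   -behaviour(nkcluster_job_class).
%%   -export([request/2, task/2, command/3, status/2, event/2]).
%%
%%   request(ping, _From) ->
%%       {reply, pong};
%%   request({slow, Term}, From) ->
%%       %% Don't block the channel: answer later with nkcluster_jobs:reply/2
%%       spawn_link(fun() -> nkcluster_jobs:reply(From, {reply, Term}) end),
%%       defer;
%%   request(_, _From) ->
%%       {error, unknown_request}.
%%
%%   task(_TaskId, wait) ->
%%       {ok, spawn_link(fun() -> receive stop -> ok end end)};
%%   task(_TaskId, _) ->
%%       {error, unknown_task}.
%%
%%   command(Pid, stop, _From) ->
%%       Pid ! stop,
%%       {reply, ok};
%%   command(_Pid, _Cmd, _From) ->
%%       {reply, unknown_command}.
%%
%%   status(_Pid, _Status) -> ok.
%%   event(_NodeId, _Event) -> ok.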
321 | 322 | 323 | -------------------------------------------------------------------------------- /src/nkcluster_jobs_core.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER Core Worker Processes 22 | -module(nkcluster_jobs_core). 23 | -author('Carlos Gonzalez '). 24 | -behaviour(nkcluster_job_class). 25 | 26 | -export([request/2, task/2, command/3, status/2, event/2]). 27 | 28 | -include("nkcluster.hrl"). 29 | 30 | 31 | %% =================================================================== 32 | %% nkcluster_jobs 33 | %% =================================================================== 34 | 35 | %% @private 36 | -spec request(term(), nkcluster_protocol:from()) -> 37 | {reply, term()} | {error, term()} | defer. 
38 | 39 | request(get_status, _From) -> 40 | {reply, nkcluster_agent:get_status()}; 41 | 42 | request({set_status, Status}, _From) -> 43 | case nkcluster_agent:set_status(Status) of 44 | ok -> {reply, ok}; 45 | {error, Error} -> {error, Error} 46 | end; 47 | 48 | request(get_meta, _From) -> 49 | {reply, nkcluster_app:get(meta)}; 50 | 51 | request({update_meta, Meta}, _From) -> 52 | OldMeta = nkcluster_app:get(meta), 53 | Meta1 = nklib_util:store_values(Meta, OldMeta), 54 | nkcluster_app:put(meta, Meta1), 55 | {reply, Meta1}; 56 | 57 | request({remove_meta, Keys}, _From) -> 58 | OldMeta = nkcluster_app:get(meta), 59 | Meta1 = nklib_util:remove_values(Keys, OldMeta), 60 | nkcluster_app:put(meta, Meta1), 61 | {reply, Meta1}; 62 | 63 | request({get_data, Key}, _From) -> 64 | Value = nkcluster_app:get({data, Key}), 65 | {reply, Value}; 66 | 67 | request({put_data, Key, Val}, _From) -> 68 | ok = nkcluster_app:put({data, Key}, Val), 69 | {reply, ok}; 70 | 71 | request({del_data, Key}, _From) -> 72 | ok = nkcluster_app:del({data, Key}), 73 | {reply, ok}; 74 | 75 | request(get_tasks, _From) -> 76 | {ok, Jobs} = nkcluster_jobs:get_tasks(), 77 | {reply, Jobs}; 78 | 79 | request({get_tasks, Class}, _From) -> 80 | {ok, Jobs} = nkcluster_jobs:get_tasks(Class), 81 | {reply, Jobs}; 82 | 83 | request({call, Mod, Fun, Args}, _From) -> 84 | {reply, catch apply(Mod, Fun, Args)}; 85 | 86 | request({spawn_call, Mod, Fun, Args}, From) -> 87 | spawn_link( 88 | fun() -> 89 | Reply = (catch apply(Mod, Fun, Args)), 90 | nkcluster_jobs:reply(From, {reply, Reply}) 91 | end), 92 | defer; 93 | 94 | request({write_file, Path, Bin}, _From) -> 95 | Path1 = nklib_util:to_list(Path), 96 | case filelib:ensure_dir(Path1) of 97 | ok -> 98 | case file:write_file(Path1, Bin) of 99 | ok -> 100 | {reply, ok}; 101 | {error, Error} -> 102 | {error, {write_error, Error}} 103 | end; 104 | {error, Error} -> 105 | {error, {write_error, Error}} 106 | end; 107 | 108 | request({load_code, Data}, From) -> 109 | 
spawn_link(fun() -> load_code(Data, From) end), 110 | defer; 111 | 112 | request(_, _From) -> 113 | {error, unknown_request}. 114 | 115 | 116 | %% @private 117 | -spec task(nkcluster:task_id(), term()) -> 118 | {ok, pid()} | {ok, term(), pid()} | {error, term()}. 119 | 120 | 121 | task(_TaskId, {write_file, Path}) -> 122 | Path1 = nklib_util:to_list(Path), 123 | case filelib:ensure_dir(Path1) of 124 | ok -> 125 | case file:open(Path1, [write, binary]) of 126 | {ok, Device} -> 127 | Pid = spawn_link(fun() -> write_file(Device) end), 128 | {ok, Pid}; 129 | {error, Error} -> 130 | {error, {write_error, Error}} 131 | end; 132 | {error, Error} -> 133 | {error, {write_error, Error}} 134 | end; 135 | 136 | task(_TaskId, _) -> 137 | {error, unknown_task}. 138 | 139 | 140 | %% @private 141 | -spec command(pid(), nkcluster:command(), nkcluster_protocol:from()) -> 142 | {reply, ok} | defer. 143 | 144 | command(Pid, {write_file, Data}, From) -> 145 | Pid ! {write_file, Data, From}, 146 | defer; 147 | 148 | command(_Pid, _Cmd, _From) -> 149 | {reply, unknown_command}. 150 | 151 | 152 | -spec status(pid(), nkcluster:node_status()) -> 153 | ok. 154 | 155 | status(Pid, Status) -> 156 | lager:info("Core process ~p notified status ~p", [Pid, Status]), 157 | ok. 158 | 159 | 160 | %% @private 161 | -spec event(nkcluster:node_id(), nkcluster:event()) -> 162 | ok. 163 | 164 | event(_NodeId, _Data) -> 165 | lager:info("Node ~s core event: ~p", [_NodeId, _Data]), 166 | ok. 
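%% Worker-side usage sketch (illustrative, not part of this module): streaming
%% a file through the {write_file, Path} task above. TaskId is chosen by the
%% caller, each 'From' is a nkcluster_protocol:from() supplied by the transport,
%% and the deferred commands are answered later via nkcluster_jobs:reply/2.
%%
%%   {reply, ok} = nkcluster_jobs:task(?MODULE, TaskId, {write_file, Path}, From0),
%%   defer = nkcluster_jobs:command(?MODULE, TaskId, {write_file, Chunk1}, From1),
%%   defer = nkcluster_jobs:command(?MODULE, TaskId, {write_file, eof}, From2),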
167 | 168 | 169 | 170 | 171 | %% =================================================================== 172 | %% Internal 173 | %% =================================================================== 174 | 175 | 176 | %% @private 177 | write_file(Device) -> 178 | receive 179 | {write_file, eof, From} -> 180 | Reply = case file:close(Device) of 181 | ok -> 182 | {reply, ok}; 183 | {error, Error} -> 184 | {error, {write_error, Error}} 185 | end, 186 | nkcluster_jobs:reply(From, Reply); 187 | {write_file, Data, From} -> 188 | case file:write(Device, Data) of 189 | ok -> 190 | nkcluster_jobs:reply(From, {reply, ok}), 191 | write_file(Device); 192 | {error, Error} -> 193 | file:close(Device), 194 | nkcluster_jobs:reply(From, {error, {write_error, Error}}) 195 | end 196 | after 180000 -> 197 | error(write_file_timeout) 198 | end. 199 | 200 | 201 | 202 | %% @private 203 | load_code([], From) -> 204 | nkcluster_jobs:reply(From, {reply, ok}); 205 | 206 | load_code([{Mod, File, Bin}|Rest], From) -> 207 | case code:load_binary(Mod, File, Bin) of 208 | {module, Mod} -> 209 | lager:info("Worker loaded module ~p", [Mod]), 210 | load_code(Rest, From); 211 | {error, Error} -> 212 | nkcluster_jobs:reply(From, {error, {load_error, Error}}) 213 | end. 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /src/nkcluster_node_proxy.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License.
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc Remote Node Control 22 | -module(nkcluster_node_proxy). 23 | -author('Carlos Gonzalez '). 24 | -behaviour(nkdist_proc). 25 | -behaviour(gen_server). 26 | 27 | -export([start_link/2, stop/1, get_info/1, get_all/0]). 28 | -export([rpc/3, new_connection/1]). 29 | -export([received_resp/3, received_event/3]). 30 | -export([start/2, start_and_join/2, join/2]). 31 | -export([init/1, terminate/2, code_change/3, handle_call/3, 32 | handle_cast/2, handle_info/2]). 33 | -export_type([conn_spec/0, start_opts/0, rpc_opts/0]). 34 | 35 | -include_lib("nkpacket/include/nkpacket.hrl"). 36 | 37 | 38 | -type conn_spec() :: 39 | pid()|nklib:user_uri()|[pid()|nklib:user_uri()]. 40 | 41 | -type start_opts() :: 42 | #{ 43 | connect => conn_spec(), 44 | password => binary(), 45 | launch => launch_opts(), 46 | ?TLS_TYPES 47 | }. 48 | 49 | -type launch_opts() :: 50 | #{ 51 | }. 52 | 53 | 54 | -type rpc_opts() :: 55 | #{ 56 | conn_pid => pid(), 57 | timeout => pos_integer() 58 | }. 59 | 60 | 61 | -define(MAX_TIME_DIFF, 5000). 62 | -define(DEF_REQ_TIME, 30000). 63 | 64 | -include_lib("nklib/include/nklib.hrl"). 65 | 66 | -define(CLLOG(Type, Msg, Vars, State), 67 | lager:Type("NkCLUSTER proxy ~s (~s) " Msg, 68 | [State#state.node_id, State#state.conn_id|Vars])). 69 | 70 | -define(TIMEOUT, 5*60*1000). 
71 | 72 | 73 | %% =================================================================== 74 | %% Public 75 | %% =================================================================== 76 | 77 | 78 | %% @doc Starts a new node control process 79 | %% If the key 'connect' is provided, it is assumed that the node is already 80 | %% started, and we will try to connect to it. 81 | %% If the key 'launch' is provided, the node will be started, using one of the 82 | %% supported providers. 83 | -spec start_link(nkcluster:node_id(), start_opts()) -> 84 | {ok, pid()} | {error, term()}. 85 | 86 | start_link(NodeId, Opts) -> 87 | gen_server:start_link(?MODULE, {NodeId, Opts}, []). 88 | 89 | 90 | %% @doc Forces the stop of a started worker 91 | -spec stop(pid()) -> 92 | ok. 93 | 94 | stop(Pid) -> 95 | gen_server:cast(Pid, stop). 96 | 97 | 98 | %% @doc Gets info about a started worker 99 | -spec get_info(pid()) -> 100 | {ok, map()} | {error, term()}. 101 | 102 | get_info(Pid) -> 103 | do_call(Pid, get_info, ?TIMEOUT). 104 | 105 | 106 | %% @doc Gets all started workers 107 | -spec get_all() -> 108 | [{nkcluster:node_id(), binary(), pid()}]. 109 | 110 | get_all() -> 111 | [ 112 | {NodeId, ConnId, Pid} || 113 | {{NodeId, ConnId}, Pid} <- nklib_proc:values(?MODULE) 114 | ]. 115 | 116 | 117 | %% @private Sends a remote request 118 | -spec rpc(pid(), nkcluster_protocol:rpc(), rpc_opts()) -> 119 | {reply, nkcluster:reply()} | {error, term()}. 120 | 121 | rpc(Pid, Cmd, Opts) -> 122 | do_call(Pid, {rpc, Cmd, Opts}, ?TIMEOUT). 123 | 124 | 125 | %% @private Opens an additional connection to the remote node 126 | -spec new_connection(pid()) -> 127 | {ok, pid()} | {error, term()}. 128 | 129 | new_connection(Pid) -> 130 | do_call(Pid, new_connection, ?TIMEOUT). 131 | 132 | 133 | %% @private Called when a response is received 134 | -spec received_resp(pid(), nkcluster_protocol:trans_id(), 135 | {reply, nkcluster:reply()} | {error, term()}) -> 136 | ok.
137 | 138 | received_resp(Pid, TransId, Reply) -> 139 | gen_server:cast(Pid, {resp, TransId, Reply}). 140 | 141 | 142 | %% @private Called when an event is received 143 | -spec received_event(pid(), nkcluster:job_class(), nkcluster:event()) -> 144 | ok. 145 | 146 | received_event(Pid, Class, Event) -> 147 | gen_server:cast(Pid, {event, Class, Event}). 148 | 149 | 150 | % =================================================================== 151 | %% nkdist_proc behaviour 152 | %% =================================================================== 153 | 154 | 155 | %% @doc Start a new process 156 | -spec start(nkcluster:node_id(), start_opts()) -> 157 | {ok, pid()} | {error, term()}. 158 | 159 | start(NodeId, Opts) -> 160 | start_link(NodeId, Opts). 161 | 162 | 163 | %% @doc Starts a new clone process 164 | -spec start_and_join(nkcluster:node_id(), pid()) -> 165 | {ok, pid()} | {error, term()}. 166 | 167 | start_and_join(NodeId, Pid) -> 168 | start_link(NodeId, #{clone=>Pid}). 169 | 170 | 171 | %% @doc Joins two existing processes 172 | -spec join(Current::pid(), Old::pid()) -> 173 | ok | {error, term()}. 174 | 175 | join(Pid, OldPid) -> 176 | gen_server:call(Pid, {join, OldPid}). 177 | 178 | 179 | 180 | % =================================================================== 181 | %% gen_server behaviour 182 | %% =================================================================== 183 | 184 | 185 | -record(req, { 186 | from :: {pid(), reference()} | ping, 187 | timer :: reference(), 188 | conn_pid :: pid() 189 | }). 
190 | 191 | 192 | -record(state, { 193 | node_id :: nkcluster:node_id(), 194 | conn_id :: binary(), 195 | conn_pid :: pid(), 196 | connect = [] :: [pid()|nklib:user_uri()], 197 | status = not_connected :: nkcluster:node_status(), 198 | connected = false :: boolean(), 199 | meta = [] :: [nklib:token()], 200 | latencies = [] :: [integer()], 201 | rpcs = #{} :: #{nkcluster_protocol:trans_id() => #req{}}, 202 | classes = [] :: [module()], 203 | opts :: map() 204 | }). 205 | 206 | 207 | %% @private 208 | -spec init(term()) -> 209 | {ok, #state{}}. 210 | 211 | init({NodeId, #{connect:=Connect0}=Opts}) -> 212 | Connect = case is_list(Connect0) of 213 | false -> [Connect0]; 214 | true when is_integer(hd(Connect0)) -> [Connect0]; 215 | true -> Connect0 216 | end, 217 | TLSKeys = nkpacket_util:tls_keys(), 218 | State = #state{ 219 | node_id = NodeId, 220 | conn_id = <<"unknown remote">>, 221 | connect = Connect, 222 | opts = maps:with([password|TLSKeys], Opts) 223 | }, 224 | ?CLLOG(info, "starting (~p)", [self()], State), 225 | self() ! connect, 226 | self() ! send_ping, 227 | {ok, State}; 228 | 229 | init({NodeId, #{launch:=_Launch}=Opts}) -> 230 | State = #state{ 231 | node_id = NodeId, 232 | conn_id = <<"unknown remote">>, 233 | opts = Opts 234 | }, 235 | ?CLLOG(info, "launching (~p)", [self()], State), 236 | self() ! launch, 237 | {ok, State}; 238 | 239 | init({NodeId, #{clone:=Pid}}) -> 240 | case gen_server:call(Pid, freeze, 60000) of 241 | {ok, Data} -> 242 | #{ 243 | node_id := NodeId, 244 | connect := Connect, 245 | meta := Meta, 246 | classes := Classes, 247 | opts := Opts 248 | } = Data, 249 | State = #state{ 250 | node_id = NodeId, 251 | conn_id = <<"unknown remote">>, 252 | connect = Connect, 253 | meta = Meta, 254 | classes = Classes, 255 | opts = Opts 256 | }, 257 | ?CLLOG(notice, "cloned from ~p (~p)", [Pid, self()], State), 258 | self() ! connect, 259 | self() ! send_ping, 260 | {ok, State}; 261 | _ -> 262 | {stop, could_not_clone} 263 | end. 
264 | 265 | 266 | %% @private 267 | -spec handle_call(term(), {pid(), term()}, #state{}) -> 268 | {noreply, #state{}} | {reply, term(), #state{}}. 269 | 270 | handle_call(new_connection, From, #state{connected=true}=State) -> 271 | #state{connect=Connect, conn_pid=ConnPid, opts=Opts} = State, 272 | Self = self(), 273 | Opts1 = Opts#{idle_timeout=>5000}, 274 | spawn_link( 275 | fun() -> 276 | Reply = case do_connect(Connect, Self, Opts1) of 277 | {ok, NewConnPid, _NodeId, _Info} -> 278 | gen_server:cast(Self, {new_connection, NewConnPid}), 279 | {ok, NewConnPid}; 280 | error -> 281 | {ok, ConnPid} 282 | end, 283 | gen_server:reply(From, Reply) 284 | end), 285 | {noreply, State}; 286 | 287 | handle_call(new_connection, _From, State) -> 288 | {reply, {error, not_connected}, State}; 289 | 290 | handle_call(get_info, _From, State) -> 291 | #state{ 292 | node_id = NodeId, 293 | conn_id = ConnId, 294 | connect = Connect, 295 | conn_pid = ConnPid, 296 | meta = Meta, 297 | status = Status, 298 | latencies = Latencies 299 | } = State, 300 | Lat = case length(Latencies) of 301 | 0 -> 0; 302 | _ -> lists:sum(Latencies) div length(Latencies) 303 | end, 304 | Info = #{ 305 | control_pid => self(), 306 | node_id => NodeId, 307 | conn_id => ConnId, 308 | listen => Connect, 309 | conn_pid => ConnPid, 310 | meta => Meta, 311 | status => Status, 312 | latency => Lat 313 | }, 314 | {reply, {ok, Info}, State}; 315 | 316 | handle_call({rpc, Msg, Opts}, From, #state{connected=true}=State) -> 317 | case send_rpc(Msg, Opts, From, State) of 318 | {ok, State1} -> 319 | {noreply, State1}; 320 | {error, State1} -> 321 | {reply, {error, connection_failed}, State1} 322 | end; 323 | 324 | handle_call({rpc, _Msg, _Opts}, _From, State) -> 325 | {reply, {error, not_connected}, State}; 326 | 327 | handle_call(freeze, From, State) -> 328 | #state{node_id=NodeId, connect=Connect, meta=Meta, 329 | classes=Classes, opts=Opts} = State, 330 | Data = #{ 331 | node_id => NodeId, 332 | connect =>
Connect, 333 | meta => Meta, 334 | classes => Classes, 335 | opts => Opts 336 | }, 337 | gen_server:reply(From, {ok, Data}), 338 | {stop, normal, State}; 339 | 340 | handle_call({join, OldPid}, From, State) -> 341 | spawn( 342 | fun() -> 343 | Reply = case gen_server:call(OldPid, freeze, 60000) of 344 | {ok, _} -> ok; 345 | _ -> {error, could_not_join} 346 | end, 347 | gen_server:reply(From, Reply) 348 | end), 349 | {noreply, State}; 350 | 351 | handle_call(Msg, _From, State) -> 352 | lager:error("Module ~p received unexpected call ~p", [?MODULE, Msg]), 353 | {noreply, State}. 354 | 355 | 356 | %% @private 357 | -spec handle_cast(term(), #state{}) -> 358 | {noreply, #state{}} | {stop, normal, #state{}}. 359 | 360 | handle_cast({resp, TransId, Reply}, #state{rpcs=Rpcs}=State) -> 361 | case maps:get(TransId, Rpcs, undefined) of 362 | #req{from=From, timer=Timer} -> 363 | nklib_util:cancel_timer(Timer), 364 | do_reply(From, Reply), 365 | Rpcs1 = maps:remove(TransId, Rpcs), 366 | {noreply, State#state{rpcs=Rpcs1}}; 367 | undefined -> 368 | ?CLLOG(notice, "received unexpected response", [], State), 369 | {noreply, State} 370 | end; 371 | 372 | handle_cast({event, Class, Event}, State) -> 373 | #state{node_id=NodeId, classes=Classes} = State, 374 | State1 = case Class/=nkcluster andalso (not lists:member(Class, Classes)) of 375 | true -> State#state{classes=[Class|Classes]}; 376 | false -> State 377 | end, 378 | State2 = case {Class, Event} of 379 | {nkcluster, {node_status, NodeStatus}} -> 380 | update_status(NodeStatus, true, State1); 381 | {nkcluster, {agent_update, Update}} -> 382 | nkcluster_nodes:node_update(NodeId, self(), Update), 383 | State1; 384 | _ -> 385 | send_event([Class], Event, State1), 386 | State1 387 | end, 388 | {noreply, State2}; 389 | 390 | handle_cast({pong, {reply, {LocTime, RemTime}}}, #state{connected=true}=State) -> 391 | #state{latencies=Latencies} = State, 392 | Now = nklib_util:l_timestamp(), 393 | Latency = (Now - LocTime), 394 | Diff = 
abs(Now - RemTime) div 1000, 395 | % lager:warning("LAT: ~p", [Latency div 1000]), 396 | case Diff > ?MAX_TIME_DIFF of 397 | true -> 398 | ?CLLOG(warning, "has too much time drift (~p msecs)", [Diff], State), 399 | {stop, normal, State}; 400 | false -> 401 | Latencies1 = Latencies++[Latency], 402 | {noreply, State#state{latencies=Latencies1}} 403 | end; 404 | 405 | handle_cast({pong, {error, Error}}, #state{connected=true, conn_pid=ConnPid}=State) -> 406 | ?CLLOG(notice, "ping failed: ~p", [Error], State), 407 | nkcluster_protocol:stop(ConnPid), 408 | {noreply, update_status(not_connected, false, State)}; 409 | 410 | handle_cast({new_connection, Pid}, State) -> 411 | monitor(process, Pid), 412 | {noreply, State}; 413 | 414 | handle_cast(stop, State) -> 415 | {stop, normal, State}; 416 | 417 | handle_cast(Msg, State) -> 418 | lager:error("Module ~p received unexpected cast ~p", [?MODULE, Msg]), 419 | {noreply, State}. 420 | 421 | 422 | %% @private 423 | -spec handle_info(term(), #state{}) -> 424 | {noreply, #state{}} | {stop, normal, #state{}}. 
425 | 426 | handle_info(connect, #state{connected=true}=State) -> 427 | ?CLLOG(warning, "ordered to reconnect in ok status", [], State), 428 | {noreply, State}; 429 | 430 | handle_info(connect, State) -> 431 | connect(State); 432 | 433 | handle_info({'DOWN', _Ref, process, Pid, _Reason}, State) -> 434 | #state{rpcs=Rpcs, conn_pid=ConnPid} = State, 435 | ListRpcs1 = lists:filter( 436 | fun({_TransId, #req{conn_pid=RpcPid, from=From}}) -> 437 | case RpcPid/=Pid of 438 | true -> 439 | true; 440 | false -> 441 | do_reply(From, {error, connection_failed}), 442 | false 443 | end 444 | end, 445 | maps:to_list(Rpcs)), 446 | State1 = State#state{rpcs=maps:from_list(ListRpcs1)}, 447 | case Pid of 448 | ConnPid -> 449 | State2 = update_status(not_connected, false, State1), 450 | connect(State2); 451 | _ -> 452 | {noreply, State1} 453 | end; 454 | 455 | handle_info({req_timeout, TransId}, State) -> 456 | handle_cast({resp, TransId, {error, timeout}}, State); 457 | 458 | handle_info(send_ping, #state{connected=true}=State) -> 459 | Now = nklib_util:l_timestamp(), 460 | Cmd = {req, nkcluster, {ping, Now}}, 461 | Ping = ping_time(), 462 | Timeout = Ping * 75 div 100, 463 | State2 = case send_rpc(Cmd, #{timeout=>Timeout}, ping, State) of 464 | {ok, State1} -> 465 | State1; 466 | {error, State1} -> 467 | ?CLLOG(notice, "ping send error", [], State1), 468 | State1 469 | end, 470 | erlang:send_after(Ping, self(), send_ping), 471 | {noreply, State2}; 472 | 473 | handle_info(send_ping, State) -> 474 | erlang:send_after(ping_time(), self(), send_ping), 475 | {noreply, State}; 476 | 477 | handle_info(Info, State) -> 478 | lager:warning("Module ~p received unexpected info: ~p (~p)", [?MODULE, Info, State]), 479 | {noreply, State}. 480 | 481 | 482 | %% @private 483 | -spec code_change(term(), #state{}, term()) -> 484 | {ok, #state{}}. 485 | 486 | code_change(_OldVsn, State, _Extra) -> 487 | {ok, State}. 488 | 489 | 490 | %% @private 491 | -spec terminate(term(), #state{}) -> 492 | ok. 
493 | 494 | terminate(Reason, State) -> 495 | update_status(not_connected, false, State), 496 | ?CLLOG(debug, "terminating: ~p", [Reason], State). 497 | 498 | 499 | 500 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Internal %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 501 | 502 | 503 | %% @private 504 | -spec connect(#state{}) -> 505 | {noreply, #state{}} | {stop, normal, #state{}}. 506 | 507 | connect(#state{connect=[]}=State) -> 508 | ?CLLOG(notice, "exiting, no connections available", [], State), 509 | {stop, normal, State}; 510 | 511 | connect(#state{node_id=NodeId, connect=Connect, opts=Opts}=State) -> 512 | State1 = update_status(not_connected, false, State), 513 | case do_connect(Connect, self(), Opts) of 514 | {ok, ConnPid, NodeId, Info} -> 515 | MyListen = nkcluster_app:get(listen), 516 | case nkcluster_protocol:set_master(ConnPid, MyListen) of 517 | ok -> 518 | Connect1 = maps:get(listen, Info, []), 519 | case node(ConnPid)/=node() andalso Connect1/=[] of 520 | true -> 521 | % the remote node is already connected to another 522 | % primary node, and has some listening addresses, 523 | % stop that connection and connect again from here 524 | ?CLLOG(info, "switching connection to local", [], State), 525 | nkcluster_protocol:stop(ConnPid), 526 | connect(State1#state{connect=Connect1}); 527 | false -> 528 | monitor(process, ConnPid), 529 | #{status:=NodeStatus, remote:=ConnId} = Info, 530 | nklib_proc:put(?MODULE, {NodeId, ConnId}), 531 | State2 = State1#state{ 532 | node_id = NodeId, 533 | conn_id = ConnId, 534 | connect = Connect1, 535 | conn_pid = ConnPid, 536 | meta = maps:get(meta, Info, #{}), 537 | latencies = [] 538 | }, 539 | {noreply, update_status(NodeStatus, true, State2)} 540 | end; 541 | {error, _} -> 542 | connect_error(State1) 543 | end; 544 | {ok, _ConnPid, _OtherNodeId, _Info} -> 545 | lager:warning("NkCLUSTER proxy connected to node with different NodeId!"), 546 | {stop, normal, State}; 547 | error -> 548 | connect_error(State1) 549 | end. 
550 | 551 | 552 | %% @private 553 | connect_error(#state{conn_pid=ConnPid}=State) -> 554 | case is_pid(ConnPid) of 555 | true -> 556 | nkcluster_protocol:stop(ConnPid); 557 | false -> 558 | ok 559 | end, 560 | Time = nkcluster_app:get(proxy_connect_retry), 561 | erlang:send_after(Time, self(), connect), 562 | {noreply, update_status(not_connected, false, State)}. 563 | 564 | 565 | %% @private 566 | update_status(Status, Connected, State) -> 567 | #state{ 568 | node_id = NodeId, 569 | status = OldStatus, 570 | connect = Listen, 571 | meta = Meta, 572 | classes = Classes, 573 | latencies = Latencies, 574 | conn_id = Remote, 575 | conn_pid = ConnPid 576 | } = State, 577 | case OldStatus==Status of 578 | true -> 579 | State; 580 | false -> 581 | case NodeId of 582 | <<>> -> 583 | ok; 584 | _ -> 585 | Update = #{ 586 | status => Status, 587 | meta => Meta, 588 | listen => Listen, 589 | latencies => Latencies, 590 | remote => Remote, 591 | conn_pid => ConnPid 592 | }, 593 | nkcluster_nodes:node_update(NodeId, self(), Update), 594 | send_event(Classes, {nkcluster, {node_status, Status}}, State), 595 | ?CLLOG(info, "status changed from '~p' to '~p'", 596 | [OldStatus, Status], State) 597 | end, 598 | State#state{status=Status, connected=Connected} 599 | end. 600 | 601 | 602 | %% @private 603 | -spec send_rpc(term(), map(), {pid(), term()}|ping, #state{}) -> 604 | {ok|error, #state{}}. 
605 | 606 | send_rpc(Msg, Opts, From, #state{rpcs=Rpcs, conn_pid=ConnPid}=State) -> 607 | CurrentPid = case Opts of 608 | #{conn_pid:=UserPid} -> UserPid; 609 | _ -> ConnPid 610 | end, 611 | Timeout = maps:get(timeout, Opts, ?DEF_REQ_TIME), 612 | case nkcluster_protocol:send_rpc(CurrentPid, Msg) of 613 | {ok, TransId} -> 614 | Timer = erlang:send_after(Timeout, self(), {req_timeout, TransId}), 615 | Rpc = #req{from=From, timer=Timer, conn_pid=CurrentPid}, 616 | Rpcs1 = maps:put(TransId, Rpc, Rpcs), 617 | {ok, State#state{rpcs=Rpcs1}}; 618 | {error, _Error} when CurrentPid==ConnPid -> 619 | nkcluster_protocol:stop(CurrentPid), 620 | {error, update_status(not_connected, false, State)}; 621 | {error, _} -> 622 | nkcluster_protocol:stop(CurrentPid), 623 | send_rpc(Msg, maps:remove(conn_pid, Opts), From, State) 624 | end. 625 | 626 | 627 | %% @private 628 | send_event([], _Event, _State) -> 629 | ok; 630 | 631 | send_event([Class|Rest], Event, #state{node_id=NodeId}=State) -> 632 | case catch Class:event(NodeId, Event) of 633 | {'EXIT', Error} -> 634 | ?CLLOG(warning, "error calling ~p:event/2: ~p", [Class, Error], State); 635 | _ -> 636 | ok 637 | end, 638 | send_event(Rest, Event, State). 639 | 640 | 641 | %% @private 642 | -spec do_connect([nklib:uri()|pid()], pid(), map()) -> 643 | {ok, pid(), binary(), map()} | error.
644 | 645 | do_connect([], _Host, _Opts) -> 646 | error; 647 | 648 | do_connect([ConnPid|Rest], Host, Opts) when is_pid(ConnPid) -> 649 | case nkcluster_protocol:wait_auth(ConnPid) of 650 | {ok, NodeId, Info} -> 651 | % Probably the agent has started this connection 652 | nkcluster_protocol:take_control(ConnPid, Host), 653 | lager:info("NkCLUSTER proxy connected to ~p", [ConnPid]), 654 | {ok, ConnPid, NodeId, Info}; 655 | {error, Error} -> 656 | lager:info("NkCLUSTER proxy error connecting to ~p: ~p", [ConnPid, Error]), 657 | do_connect(Rest, Host, Opts) 658 | end; 659 | 660 | do_connect([ConnUri|Rest], Host, Opts) -> 661 | ConnOpts1 = nkcluster_agent:connect_opts({control, Host}, Host, Opts), 662 | IdleTimeout = maps:get(idle_timeout, Opts, 3*ping_time()), 663 | ConnOpts2 = ConnOpts1#{idle_timeout => IdleTimeout}, 664 | case nkpacket:connect(ConnUri, ConnOpts2) of 665 | {ok, ConnPid} -> 666 | case nkcluster_protocol:wait_auth(ConnPid) of 667 | {ok, NodeId, Info} -> 668 | ConnId = nklib_util:to_binary(ConnUri), 669 | lager:info("NkCLUSTER proxy connected to ~s", [ConnId]), 670 | {ok, ConnPid, NodeId, Info}; 671 | {error, Error} -> 672 | ConnId = nklib_util:to_binary(ConnUri), 673 | lager:info("NkCLUSTER proxy error connecting to ~s: ~p", [ConnId, Error]), 674 | do_connect(Rest, Host, Opts) 675 | end; 676 | {error, Error} -> 677 | ConnId = nklib_util:to_binary(ConnUri), 678 | lager:info("NkCLUSTER proxy error connecting to ~s: ~p", [ConnId, Error]), 679 | do_connect(Rest, Host, Opts) 680 | end. 681 | 682 | 683 | %% @private 684 | -spec do_reply({pid(), term()}|ping, term()) -> 685 | ok. 686 | 687 | do_reply(ping, Reply) -> 688 | gen_server:cast(self(), {pong, Reply}); 689 | 690 | do_reply(From, Reply) -> 691 | gen_server:reply(From, Reply). 692 | 693 | 694 | %% @private 695 | -spec do_call(pid(), term(), pos_integer()) -> 696 | term(). 
697 | 698 | do_call(Pid, _Msg, _Timeout) when Pid==self() -> 699 | {error, looped_request}; 700 | 701 | do_call(Pid, Msg, Timeout) -> 702 | case catch gen_server:call(Pid, Msg, Timeout) of 703 | {'EXIT', _} -> {error, worker_failed}; 704 | Other -> Other 705 | end. 706 | 707 | 708 | %% @private 709 | ping_time() -> nkcluster_app:get(ping_time). 710 | 711 | -------------------------------------------------------------------------------- /src/nkcluster_nodes.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc Master Management 22 | %% One of these servers is started at each node. 23 | %% One of them is elected master and receives all the information. 24 | -module(nkcluster_nodes). 25 | -author('Carlos Gonzalez '). 26 | -behaviour(nkdist_gen_server). 27 | 28 | -export([get_nodes/0, get_local_nodes/0]). 29 | -export([get_node_info/1, get_local_node_info/1, get_node_proxy/1]). 30 | -export([rpc/3, new_connection/1, connect/2, stop/1]). 31 | -export([node_announce/2, node_update/3]).
32 | -export([start_link/0, init/1, terminate/2, code_change/3, handle_call/3, 33 | handle_cast/2, handle_info/2, handle_master/2]). 34 | 35 | -export_type([info/0]). 36 | 37 | 38 | -type update() :: 39 | #{ 40 | status => nkcluster:node_status(), 41 | listen => [nklib:uri()], 42 | meta => [nklib:token()], 43 | stats => map() 44 | }. 45 | 46 | 47 | -type info() :: 48 | #{ 49 | id => nkcluster:node_id(), 50 | proxies => [pid()] 51 | } 52 | | update(). 53 | 54 | -define(TIMEOUT, 60000). 55 | 56 | 57 | %% =================================================================== 58 | %% Public 59 | %% =================================================================== 60 | 61 | 62 | %% @doc Get the current recognized nodes, calling the master 63 | -spec get_nodes() -> 64 | [nkcluster:node_id()]. 65 | 66 | get_nodes() -> 67 | nkdist_gen_server:call(?MODULE, get_nodes, ?TIMEOUT). 68 | 69 | 70 | %% @doc Get the current recognized nodes from the local server 71 | -spec get_local_nodes() -> 72 | [nkcluster:node_id()]. 73 | 74 | get_local_nodes() -> 75 | gen_server:call(?MODULE, get_nodes, ?TIMEOUT). 76 | 77 | 78 | %% @doc Get node information from the master 79 | -spec get_node_info(nkcluster:node_id()) -> 80 | {ok, info()} | {error, term()}. 81 | 82 | get_node_info(NodeId) -> 83 | nkdist_gen_server:call(?MODULE, {get_node_info, NodeId}, ?TIMEOUT). 84 | 85 | 86 | %% @doc Get node information from the local server 87 | -spec get_local_node_info(nkcluster:node_id()) -> 88 | {ok, info()} | {error, term()}. 89 | 90 | get_local_node_info(NodeId) -> 91 | gen_server:call(?MODULE, {get_node_info, NodeId}, ?TIMEOUT). 92 | 93 | 94 | %% @doc Get the current pid for a node proxy 95 | -spec get_node_proxy(nkcluster:node_id()) -> 96 | {ok, pid()} | {error, not_found} | {error, term()}. 97 | 98 | get_node_proxy(NodeId) -> 99 | nkdist:find_proc(nkcluster_node_proxy, NodeId).
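%% Illustrative use of the public API above from an Erlang shell on a
%% control node. The node id and pid shown are hypothetical placeholders,
%% not output captured from a real session:
%%
%%   1> nkcluster_nodes:get_nodes().
%%   [<<"9c4ea2f2...">>]
%%   2> nkcluster_nodes:get_node_proxy(<<"9c4ea2f2...">>).
%%   {ok, <0.245.0>}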
100 | 101 | 102 | %% @private Sends a remote request 103 | -spec rpc(nkcluster:node_id(), nkcluster_protocol:rpc(), 104 | nkcluster_node_proxy:rpc_opts()) -> 105 | {reply, nkcluster:reply()} | {error, term()}. 106 | 107 | rpc(NodeId, Cmd, Opts) -> 108 | case get_node_proxy(NodeId) of 109 | {ok, Pid} -> 110 | nkcluster_node_proxy:rpc(Pid, Cmd, Opts); 111 | {error, Error} -> 112 | {error, Error} 113 | end. 114 | 115 | 116 | %% @private Starts a new connection to the node 117 | -spec new_connection(nkcluster:node_id()) -> 118 | {ok, pid()} | {error, term()}. 119 | 120 | new_connection(NodeId) -> 121 | case get_node_proxy(NodeId) of 122 | {ok, Pid} -> 123 | nkcluster_node_proxy:new_connection(Pid); 124 | {error, Error} -> 125 | {error, Error} 126 | end. 127 | 128 | 129 | %% @doc Manually connect to a remote worker using its listening address 130 | -spec connect(nklib:user_uri(), nkcluster_agent:connect_opts()) -> 131 | {ok, nkcluster:node_id(), map(), pid()} | {error, term()}. 132 | 133 | connect(Uri, Opts) when is_map(Opts) -> 134 | case nkcluster_agent:connect(Uri, Opts) of 135 | {ok, NodeId, #{conn_pid:=ConnPid}=Info} -> 136 | case try_connect(NodeId, ConnPid, #{}) of 137 | {ok, Pid} -> {ok, NodeId, Info, Pid}; 138 | {error, Error} -> {error, Error} 139 | end; 140 | {error, Error} -> 141 | {error, Error} 142 | end. 143 | 144 | 145 | -spec stop(nkcluster:node_id()) -> 146 | ok | {error, term()}. 147 | 148 | stop(NodeId) -> 149 | case get_node_proxy(NodeId) of 150 | {ok, Pid} -> 151 | nkcluster_node_proxy:stop(Pid); 152 | {error, Error} -> 153 | {error, Error} 154 | end. 155 | 156 | 157 | %% =================================================================== 158 | %% Internal 159 | %% =================================================================== 160 | 161 | 162 | %% @private Called from our listening connection when a remote node announces itself 163 | -spec node_announce(nkcluster:node_id(), pid()) -> 164 | ok. 
165 | 166 | node_announce(NodeId, ConnPid) -> 167 | gen_server:cast(?MODULE, {node_announce, NodeId, ConnPid}). 168 | 169 | 170 | %% @private Called from a node proxy when it has an update about its node 171 | -spec node_update(nkcluster:node_id(), pid(), update()) -> 172 | ok. 173 | 174 | node_update(NodeId, ControlPid, Status) -> 175 | gen_server:cast(?MODULE, {node_update, NodeId, ControlPid, Status}). 176 | 177 | 178 | 179 | %% =================================================================== 180 | %% gen_server 181 | %% =================================================================== 182 | 183 | 184 | %% @private 185 | start_link() -> 186 | nkdist_gen_server:start_link(?MODULE, [], []). 187 | 188 | 189 | %% We can eventually have several proxies for a remote node 190 | -record(node, { 191 | info = #{} :: map(), 192 | pids = [] :: [pid()] 193 | }). 194 | 195 | 196 | -record(state, { 197 | master :: pid() | undefined, 198 | nodes = #{} :: #{nkcluster:node_id() => #node{}}, 199 | pids = #{} :: #{pid() => nkcluster:node_id()} 200 | }). 201 | 202 | 203 | %% @private 204 | init([]) -> 205 | {ok, #state{}}. 206 | 207 | 208 | -spec handle_call(term(), {pid(), term()}, #state{}) -> 209 | {reply, term(), #state{}} | {noreply, #state{}}. 210 | 211 | handle_call(get_nodes, _From, #state{nodes=Nodes}=State) -> 212 | {reply, maps:keys(Nodes), State}; 213 | 214 | handle_call({get_node_info, NodeId}, _From, #state{nodes=Nodes}=State) -> 215 | case maps:get(NodeId, Nodes, undefined) of 216 | #node{} = Node -> 217 | {reply, {ok, node_to_info(NodeId, Node)}, State}; 218 | undefined -> 219 | {reply, {error, not_found}, State} 220 | end; 221 | 222 | handle_call(get_state, _From, State) -> 223 | {reply, State, State}; 224 | 225 | handle_call(Msg, _From, State) -> 226 | lager:error("Module ~p received unexpected call ~p", [?MODULE, Msg]), 227 | {noreply, State}. 228 | 229 | 230 | %% @private 231 | -spec handle_cast(term(), #state{}) -> 232 | {noreply, #state{}}.
233 | 234 | handle_cast({node_announce, NodeId, ConnPid}, State) -> 235 | lager:info("NkCLUSTER nodes manager received announce from ~s (~p)", 236 | [NodeId, ConnPid]), 237 | spawn(fun() -> try_connect(NodeId, ConnPid, #{}) end), 238 | {noreply, State}; 239 | 240 | handle_cast({node_update, NodeId, Pid, Info}, #state{master=Master}=State) -> 241 | master_update(Master, NodeId, Pid, Info), 242 | {noreply, do_update(NodeId, Pid, Info, State)}; 243 | 244 | handle_cast(Msg, State) -> 245 | lager:error("Module ~p received unexpected cast ~p", [?MODULE, Msg]), 246 | {noreply, State}. 247 | 248 | 249 | %% @private 250 | -spec handle_info(term(), #state{}) -> 251 | {noreply, #state{}}. 252 | 253 | handle_info({'DOWN', _, process, Pid, _}, State) -> 254 | #state{nodes=Nodes, pids=Pids} = State, 255 | case maps:is_key(Pid, Pids) of 256 | true -> 257 | NodeId = maps:get(Pid, Pids), 258 | #node{pids=NodePids} = Node = maps:get(NodeId, Nodes), 259 | case NodePids -- [Pid] of 260 | [] -> 261 | Nodes1 = maps:remove(NodeId, Nodes), 262 | Pids1 = maps:remove(Pid, Pids), 263 | {noreply, State#state{nodes=Nodes1, pids=Pids1}}; 264 | NodePids1 -> 265 | Node1 = Node#node{pids=NodePids1}, 266 | Nodes1 = maps:update(NodeId, Node1, Nodes), 267 | Pids1 = maps:remove(Pid, Pids), 268 | {noreply, State#state{nodes=Nodes1, pids=Pids1}} 269 | end; 270 | false -> 271 | lager:warning("Module ~p received unexpected 'DOWN': ~p", [?MODULE, Pid]), 272 | {noreply, State} 273 | end; 274 | 275 | handle_info(Info, State) -> 276 | lager:warning("Module ~p received unexpected info: ~p", [?MODULE, Info]), 277 | {noreply, State}. 278 | 279 | 280 | %% @private Implementing the nkdist_gen_server handle_master callback 281 | -spec handle_master(pid()|undefined, #state{}) -> 282 | {ok, #state{}}. 
283 | 284 | handle_master(Master, State) when is_pid(Master) -> 285 | lager:notice("NkCLUSTER nodes manager ~p master is ~p (~p)", 286 | [self(), node(Master), Master]), 287 | {ok, master_update_all(State#state{master=Master})}; 288 | 289 | handle_master(undefined, State) -> 290 | lager:notice("NkCLUSTER nodes manager ~p master is undefined!", [self()]), 291 | {ok, State#state{master=undefined}}. 292 | 293 | 294 | %% @private 295 | -spec code_change(term(), #state{}, term()) -> 296 | {ok, #state{}}. 297 | 298 | code_change(_OldVsn, State, _Extra) -> 299 | {ok, State}. 300 | 301 | 302 | %% @private 303 | -spec terminate(term(), #state{}) -> 304 | ok. 305 | 306 | terminate(_Reason, _State) -> 307 | ok. 308 | 309 | 310 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Internal %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 311 | 312 | 313 | %% @private 314 | master_update(Master, NodeId, Pid, Info) when is_pid(Master), Master /= self() -> 315 | gen_server:cast(Master, {node_update, NodeId, Pid, Info}); 316 | 317 | master_update(_Master, _NodeId, _Pid, _Info) -> 318 | ok. 319 | 320 | 321 | %% @private 322 | master_update_all(#state{master=Master, nodes=Nodes}=State) -> 323 | lists:foreach( 324 | fun({NodeId, #node{pids=Pids, info=Info}}) -> 325 | lists:foreach( 326 | fun(Pid) -> master_update(Master, NodeId, Pid, Info) end, 327 | Pids) 328 | end, 329 | maps:to_list(Nodes)), 330 | State. 331 | 332 | 333 | %% @private 334 | node_to_info(NodeId, #node{info=Info, pids=Pids}) -> 335 | Info#{id=>NodeId, proxies=>Pids}. 336 | 337 | 338 | %% @private 339 | -spec do_update(nkcluster:node_id(), pid(), update(), #state{}) -> 340 | #state{}.
341 | 342 | do_update(NodeId, Pid, Info, #state{nodes=Nodes, pids=Pids}=State) -> 343 | #node{info=NodeInfo, pids=NodePids} = maps:get(NodeId, Nodes, #node{}), 344 | Info2 = maps:merge(NodeInfo, Info), 345 | Node2 = case lists:member(Pid, NodePids) of 346 | true -> 347 | #node{info=Info2, pids=NodePids}; 348 | false -> 349 | #node{info=Info2, pids=[Pid|NodePids]} 350 | end, 351 | Nodes2 = maps:put(NodeId, Node2, Nodes), 352 | Pids2 = case maps:get(Pid, Pids, undefined) of 353 | undefined -> 354 | monitor(process, Pid), 355 | maps:put(Pid, NodeId, Pids); 356 | NodeId -> 357 | Pids; 358 | _ -> 359 | lager:warning("NkCLUSTER nodes manager received update for OLD node"), 360 | Pids 361 | end, 362 | State#state{nodes=Nodes2, pids=Pids2}. 363 | 364 | 365 | %% @private 366 | -spec try_connect(nkcluster:node_id(), term(), map()) -> 367 | {ok, pid()} | {error, term()}. 368 | 369 | try_connect(NodeId, Connect, Opts) -> 370 | case nkdist:find_proc(nkcluster_node_proxy, NodeId) of 371 | {ok, Pid} -> 372 | {ok, Pid}; 373 | {error, _} -> 374 | lager:info("NkCLUSTER nodes manager starting proxy to ~s", [NodeId]), 375 | Arg = Opts#{connect=>Connect}, 376 | % Calls nkcluster_node_proxy:start/2 377 | case nkdist:start_proc(nkcluster_node_proxy, NodeId, Arg) of 378 | {ok, Pid} -> 379 | {ok, Pid}; 380 | {error, {already_started, Pid}} -> 381 | {ok, Pid}; 382 | {error, Error} -> 383 | lager:warning("NkCLUSTER nodes manager could not start proxy: ~p", 384 | [Error]), 385 | {error, Error} 386 | end 387 | end. 388 | 389 | 390 | -------------------------------------------------------------------------------- /src/nkcluster_nodes.erl.1: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 
4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc Master Management 22 | %% One of these servers is started at each node. 23 | %% One of them is elected master and receives all the information. 24 | -module(nkcluster_nodes). 25 | -author('Carlos Gonzalez '). 26 | -behaviour(nkdist_gen_server). 27 | 28 | -export([get_nodes/0, get_local_nodes/0]). 29 | -export([get_node_info/1, get_local_node_info/1, get_node_proxy/1]). 30 | -export([rpc/3, new_connection/1, connect/2, stop/1]). 31 | -export([node_announce/2, node_update/3]). 32 | -export([start_link/0, init/1, terminate/2, code_change/3, handle_call/3, 33 | handle_cast/2, handle_info/2, handle_master/2]). 34 | 35 | -export_type([info/0]). 36 | 37 | 38 | -type update() :: 39 | #{ 40 | status => nkcluster:node_status(), 41 | listen => [nklib:uri()], 42 | meta => [nklib:token()], 43 | stats => map() 44 | }. 45 | 46 | 47 | -type info() :: 48 | 49 | #{ 50 | id => nkcluster:node_id(), 51 | proxies => [pid()], 52 | status => nkcluster:node_status(), 53 | listen => [nklib:uri()], 54 | meta => [nklib:token()], 55 | stats => map() 56 | }. 57 | 58 | -define(TIMEOUT, 60000).
59 | 60 | 61 | %% =================================================================== 62 | %% Public 63 | %% =================================================================== 64 | 65 | 66 | %% @doc Get the current recognized nodes, calling to the master 67 | -spec get_nodes() -> 68 | [nkcluster:node_id()]. 69 | 70 | get_nodes() -> 71 | nkdist_gen_server:call(?MODULE, get_nodes, ?TIMEOUT). 72 | 73 | 74 | %% @doc Get the current recognized nodes 75 | -spec get_local_nodes() -> 76 | [nkcluster:node_id()]. 77 | 78 | get_local_nodes() -> 79 | gen_server:call(?MODULE, get_nodes, ?TIMEOUT). 80 | 81 | 82 | %% @doc Get node information from the master 83 | -spec get_node_info(nkcluster:node_id()) -> 84 | {ok, info()} | {error, term()}. 85 | 86 | get_node_info(NodeId) -> 87 | nkdist_gen_server:call(?MODULE, {get_node_info, NodeId}, ?TIMEOUT). 88 | 89 | 90 | %% @doc Get node information 91 | -spec get_local_node_info(nkcluster:node_id()) -> 92 | {ok, info()} | {error, term()}. 93 | 94 | get_local_node_info(NodeId) -> 95 | gen_server:call(?MODULE, {get_node_info, NodeId}, ?TIMEOUT). 96 | 97 | 98 | %% @doc Get the current pid for a node proxy 99 | -spec get_node_proxy(nkcluster:node_id()) -> 100 | {ok, pid()} | {error, not_found} | {error, term()}. 101 | 102 | get_node_proxy(NodeId) -> 103 | nkdist:find_proc(nkcluster_node_proxy, NodeId). 104 | 105 | 106 | %% @private Sends a remote request 107 | -spec rpc(nkcluster:node_id(), nkcluster_protocol:rpc(), 108 | nkcluster_node_proxy:rpc_opts()) -> 109 | {reply, nkcluster:reply()} | {error, term()}. 110 | 111 | rpc(NodeId, Cmd, Opts) -> 112 | case get_node_proxy(NodeId) of 113 | {ok, Pid} -> 114 | nkcluster_node_proxy:rpc(Pid, Cmd, Opts); 115 | {error, Error} -> 116 | {error, Error} 117 | end. 118 | 119 | 120 | %% @private Starts a new connection to the node 121 | -spec new_connection(nkcluster:node_id()) -> 122 | {ok, pid()} | {error, term()}. 
123 | 124 | new_connection(NodeId) -> 125 | case get_node_proxy(NodeId) of 126 | {ok, Pid} -> 127 | nkcluster_node_proxy:new_connection(Pid); 128 | {error, Error} -> 129 | {error, Error} 130 | end. 131 | 132 | 133 | %% @doc Manually connect to a remote worker using its listening address 134 | -spec connect(nklib:user_uri(), nkcluster_agent:connect_opts()) -> 135 | {ok, nkcluster:node_id(), map(), pid()} | {error, term()}. 136 | 137 | connect(Uri, Opts) when is_map(Opts) -> 138 | case nkcluster_agent:connect(Uri, Opts) of 139 | {ok, NodeId, #{conn_pid:=ConnPid}=Info} -> 140 | case try_connect(NodeId, ConnPid, #{}) of 141 | {ok, Pid} -> {ok, NodeId, Info, Pid}; 142 | {error, Error} -> {error, Error} 143 | end; 144 | {error, Error} -> 145 | {error, Error} 146 | end. 147 | 148 | 149 | -spec stop(nkcluster:node_id()) -> 150 | ok | {error, term()}. 151 | 152 | stop(NodeId) -> 153 | case get_node_proxy(NodeId) of 154 | {ok, Pid} -> 155 | nkcluster_node_proxy:stop(Pid); 156 | {error, Error} -> 157 | {error, Error} 158 | end. 159 | 160 | 161 | %% =================================================================== 162 | %% Internal 163 | %% =================================================================== 164 | 165 | 166 | %% @private Called from our listening connection when a remote node announces itself 167 | -spec node_announce(nkcluster:node_id(), pid()) -> 168 | ok. 169 | 170 | node_announce(NodeId, ConnPid) -> 171 | gen_server:cast(?MODULE, {node_announce, NodeId, ConnPid}). 172 | 173 | 174 | %% @private Called from a node proxy when it has an update about its node 175 | -spec node_update(nkcluster:node_id(), pid(), update()) -> 176 | ok. 177 | 178 | node_update(NodeId, ControlPid, Status) -> 179 | gen_server:cast(?MODULE, {node_update, NodeId, ControlPid, Status}). 
180 | 181 | 182 | 183 | %% =================================================================== 184 | %% gen_server 185 | %% =================================================================== 186 | 187 | 188 | %% @private 189 | start_link() -> 190 | nkdist_gen_server:start_link(?MODULE, [], []). 191 | 192 | 193 | %% We can eventually have several proxies for a remote node 194 | -record(node, { 195 | info = #{} :: map(), 196 | pids = [] :: [pid()] 197 | }). 198 | 199 | 200 | -record(state, { 201 | master :: pid() | undefined, 202 | nodes = #{} :: #{nkcluster:node_id() => #node{}}, 203 | pids = #{} :: #{pid() => nkcluster:node_id()} 204 | }). 205 | 206 | 207 | %% @private 208 | init([]) -> 209 | {ok, #state{}}. 210 | 211 | 212 | -spec handle_call(term(), {pid(), term()}, #state{}) -> 213 | {reply, term(), #state{}} | {noreply, #state{}}. 214 | 215 | handle_call(get_nodes, _From, #state{nodes=Nodes}=State) -> 216 | {reply, maps:keys(Nodes), State}; 217 | 218 | handle_call({get_node_info, NodeId}, _From, #state{nodes=Nodes}=State) -> 219 | case maps:get(NodeId, Nodes, undefined) of 220 | #node{} = Node -> 221 | {reply, {ok, node_to_info(NodeId, Node)}, State}; 222 | undefined -> 223 | {reply, {error, not_found}, State} 224 | end; 225 | 226 | handle_call(get_state, _From, State) -> 227 | {reply, State, State}; 228 | 229 | handle_call(Msg, _From, State) -> 230 | lager:error("Module ~p received unexpected call ~p", [?MODULE, Msg]), 231 | {noreply, State}. 232 | 233 | 234 | %% @private 235 | -spec handle_cast(term(), #state{}) -> 236 | {noreply, #state{}}.
237 | 238 | handle_cast({node_announce, NodeId, ConnPid}, State) -> 239 | lager:info("NkCLUSTER Nodes announce from ~s (~p)", [NodeId, ConnPid]), 240 | spawn(fun() -> try_connect(NodeId, ConnPid, #{}) end), 241 | {noreply, State}; 242 | 243 | handle_cast({node_update, NodeId, Pid, Info}, State) -> 244 | master_update(NodeId, Pid, Info, State), 245 | {noreply, do_update(NodeId, Pid, Info, State)}; 246 | 247 | handle_cast(Msg, State) -> 248 | lager:error("Module ~p received unexpected cast ~p", [?MODULE, Msg]), 249 | {noreply, State}. 250 | 251 | 252 | %% @private 253 | -spec handle_info(term(), #state{}) -> 254 | {noreply, #state{}}. 255 | 256 | handle_info({'DOWN', _, process, Pid, _}, State) -> 257 | #state{nodes=Nodes, pids=Pids} = State, 258 | case maps:is_key(Pid, Pids) of 259 | true -> 260 | NodeId = maps:get(Pid, Pids), 261 | #node{pids=NodePids} = Node = maps:get(NodeId, Nodes), 262 | case NodePids -- [Pid] of 263 | [] -> 264 | Nodes1 = maps:remove(NodeId, Nodes), 265 | Pids1 = maps:remove(Pid, Pids), 266 | {noreply, State#state{nodes=Nodes1, pids=Pids1}}; 267 | NodePids1 -> 268 | Node1 = Node#node{pids=NodePids1}, 269 | Nodes1 = maps:update(NodeId, Node1, Nodes), 270 | Pids1 = maps:remove(Pid, Pids), 271 | {noreply, State#state{nodes=Nodes1, pids=Pids1}} 272 | end; 273 | false -> 274 | lager:warning("Module ~p received unexpected 'DOWN': ~p", [?MODULE, Pid]), 275 | {noreply, State} 276 | end; 277 | 278 | handle_info(Info, State) -> 279 | lager:warning("Module ~p received unexpected info: ~p", [?MODULE, Info]), 280 | {noreply, State}. 281 | 282 | 283 | %% @private 284 | -spec handle_master(pid()|undefined, #state{}) -> 285 | {ok, #state{}}. 
286 | 287 | handle_master(Master, State) when is_pid(Master) -> 288 | lager:notice("NkCLUSTER Nodes ~p master is ~p (~p)", [self(), node(Master), Master]), 289 | {ok, master_update_all(State#state{master=Master})}; 290 | 291 | handle_master(Master, State) -> 292 | lager:notice("NkCLUSTER Nodes ~p master is ~p", [self(), Master]), 293 | {ok, State#state{master=Master}}. 294 | 295 | 296 | %% @private 297 | -spec code_change(term(), #state{}, term()) -> 298 | {ok, #state{}}. 299 | 300 | code_change(_OldVsn, State, _Extra) -> 301 | {ok, State}. 302 | 303 | 304 | %% @private 305 | -spec terminate(term(), #state{}) -> 306 | ok. 307 | 308 | terminate(_Reason, _State) -> 309 | ok. 310 | 311 | 312 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Internal %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 313 | 314 | 315 | %% @private 316 | master_update(NodeId, Pid, Info, #state{master=Master}=State) 317 | when is_pid(Master), Master /= self() -> 318 | gen_server:cast(Master, {node_update, NodeId, Pid, Info}), 319 | State; 320 | 321 | master_update(_NodeId, _Pid, _Info, State) -> 322 | State. 323 | 324 | 325 | %% @private 326 | master_update_all(#state{nodes=Nodes}=State) -> 327 | lists:foreach( 328 | fun({NodeId, #node{pids=Pids, info=Info}}) -> 329 | lists:foreach( 330 | fun(Pid) -> master_update(NodeId, Pid, Info, State) end, 331 | Pids) 332 | end, 333 | maps:to_list(Nodes)), 334 | State. 335 | 336 | 337 | %% @private 338 | node_to_info(NodeId, #node{info=Info, pids=Pids}) -> 339 | Info#{id=>NodeId, proxies=>Pids}. 340 | 341 | 342 | %% @private 343 | -spec do_update(nkcluster:node_id(), pid(), update(), #state{}) -> 344 | #state{}.
345 | 346 | do_update(NodeId, Pid, Info, #state{nodes=Nodes, pids=Pids}=State) -> 347 | #node{info=NodeInfo, pids=NodePids} = maps:get(NodeId, Nodes, #node{}), 348 | Info2 = maps:merge(NodeInfo, Info), 349 | Node2 = case lists:member(Pid, NodePids) of 350 | true -> 351 | #node{info=Info2, pids=NodePids}; 352 | false -> 353 | #node{info=Info2, pids=[Pid|NodePids]} 354 | end, 355 | Nodes2 = maps:put(NodeId, Node2, Nodes), 356 | Pids2 = case maps:get(Pid, Pids, undefined) of 357 | undefined -> 358 | monitor(process, Pid), 359 | maps:put(Pid, NodeId, Pids); 360 | NodeId -> 361 | Pids; 362 | _ -> 363 | lager:warning("NkCLUSTER Nodes received update for OLD node"), 364 | Pids 365 | end, 366 | State#state{nodes=Nodes2, pids=Pids2}. 367 | 368 | 369 | %% @private 370 | -spec try_connect(nkcluster:node_id(), term(), map()) -> 371 | {ok, pid()} | {error, term()}. 372 | 373 | try_connect(NodeId, Connect, Opts) -> 374 | case nkdist:find_proc(nkcluster_node_proxy, NodeId) of 375 | {ok, Pid} -> 376 | {ok, Pid}; 377 | {error, _} -> 378 | lager:info("NkCLUSTER Nodes starting proxy to ~s", [NodeId]), 379 | Arg = Opts#{connect=>Connect}, 380 | case nkdist:start_proc(nkcluster_node_proxy, NodeId, Arg) of 381 | {ok, Pid} -> 382 | {ok, Pid}; 383 | {error, {already_started, Pid}} -> 384 | {ok, Pid}; 385 | {error, Error} -> 386 | lager:warning("NkCLUSTER Nodes could not start proxy: ~p", [Error]), 387 | {error, Error} 388 | end 389 | end. 390 | 391 | 392 | -------------------------------------------------------------------------------- /src/nkcluster_protocol.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. 
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc Protocol behaviour 22 | %% 23 | %% This module implements the wire protocol for control and worker nodes 24 | %% 25 | -module(nkcluster_protocol). 26 | -author('Carlos Gonzalez '). 27 | -behaviour(nkpacket_protocol). 28 | 29 | -export([wait_auth/1, set_master/2, take_control/2]). 30 | -export([send_rpc/2, send_reply/3, send_event/2, send_announce/0, send_announce/1]). 31 | -export([get_all/0, encode/1, stop/1]). 32 | -export([transports/1, default_port/1, naptr/2, encode/2]). 33 | -export([conn_init/1, conn_parse/3, conn_handle_call/4, 34 | conn_handle_cast/3, conn_handle_info/3, conn_stop/3]). 35 | 36 | -export_type([conn_id/0, from/0, msg/0, rpc/0, trans_id/0]). 37 | 38 | -include_lib("nklib/include/nklib.hrl"). 39 | 40 | -type conn_id() :: pid() | nkpacket:nkport(). 41 | -type trans_id() :: pos_integer(). 42 | -type from() :: {pid(), trans_id()}. 43 | 44 | -type msg() :: 45 | announce | 46 | set_master | 47 | {auth, map()|{error, term()}} | 48 | {rpc, trans_id(), rpc()} | 49 | {rep, trans_id(), nkcluster:reply()} | 50 | {ev, nkcluster:job_class(), nkcluster:event()}. 51 | 52 | -type rpc() :: 53 | {req, nkcluster:job_class(), nkcluster:request()} | 54 | {tsk, nkcluster:job_class(), nkcluster:task_id(), nkcluster:task()} | 55 | {cmd, nkcluster:job_class(), nkcluster:task_id(), nkcluster:command()}. 56 | 57 | 58 | 59 | 60 | -define(VSNS, [0]). % Supported versions 61 | -define(MAX_TIME_DIFF, 5000). 
62 | 63 | %% =================================================================== 64 | %% Public 65 | %% =================================================================== 66 | 67 | 68 | %% @private Performs a synchronous request to authenticate to the worker 69 | -spec wait_auth(conn_id()) -> 70 | {ok, nkcluster:node_id(), map()} | {error, term()}. 71 | 72 | wait_auth(ConnId) -> 73 | do_call(ConnId, wait_auth). 74 | 75 | 76 | %% @private Sets this connection as master for this cluster 77 | -spec set_master(conn_id(), [nklib:uri()]) -> 78 | ok | {error, term()}. 79 | 80 | set_master(ConnId, Uris) -> 81 | do_call(ConnId, {send, {set_master, Uris}}). 82 | 83 | 84 | -spec take_control(ConnPid::pid(), Proxy::pid()) -> 85 | ok. 86 | 87 | take_control(ConnPid, Proxy) -> 88 | gen_server:cast(ConnPid, {take_control, Proxy}). 89 | 90 | 91 | %% @private 92 | -spec send_rpc(conn_id(), rpc()) -> 93 | {ok, trans_id()} | {error, term()}. 94 | 95 | send_rpc(ConnId, Rpc) -> 96 | do_call(ConnId, {rpc, Rpc}). 97 | 98 | 99 | -spec send_reply(conn_id(), trans_id(), nkcluster:reply()) -> 100 | ok | {error, term()}. 101 | 102 | send_reply(ConnId, TransId, Reply) -> 103 | do_call(ConnId, {send, encode({rep, TransId, Reply})}). 104 | 105 | 106 | %% @private 107 | -spec send_event(nkcluster:job_class(), nkcluster:event()) -> 108 | ok | {error, term()}. 109 | 110 | send_event(Class, Msg) -> 111 | send_event(get_worker_master(), Class, Msg). 112 | 113 | 114 | %% @doc Gets all worker 'main' connections 115 | -spec get_worker_master() -> 116 | [pid()]. 117 | 118 | get_worker_master() -> 119 | [Pid || {undefined, Pid} <- nklib_proc:values(nkcluster_worker_master)]. 120 | 121 | 122 | %% @private 123 | send_event([], _Class, _Msg) -> 124 | {error, no_connections}; 125 | 126 | send_event([Pid|Rest], Class, Msg) -> 127 | case do_call(Pid, {send, encode({ev, Class, Msg})}) of 128 | ok -> 129 | ok; 130 | {error, _} -> 131 | send_event(Rest, Class, Msg) 132 | end. 
133 | 134 | 135 | %% @private 136 | -spec send_announce() -> 137 | ok | error. 138 | 139 | send_announce() -> 140 | send_announce(get_all_worker()). 141 | 142 | 143 | %% @private 144 | send_announce([]) -> 145 | error; 146 | 147 | send_announce([Pid|Rest]) -> 148 | case do_call(Pid, {send, announce}) of 149 | ok -> ok; 150 | _ -> send_announce(Rest) 151 | end. 152 | 153 | 154 | %% @doc Gets all started connections 155 | -spec get_all() -> 156 | [{control|worker, Node::nkcluster:node_id(), Remote::nkcluster:node_id(), pid()}]. 157 | 158 | get_all() -> 159 | [ 160 | {Type, NodeId, Remote, Pid} || 161 | {{Type, NodeId, Remote}, Pid} <- nklib_proc:values(nkcluster_conn) 162 | ]. 163 | 164 | 165 | %% @doc Gets all worker started connections to primary nodes 166 | -spec get_all_worker() -> 167 | [pid()]. 168 | 169 | get_all_worker() -> 170 | [Pid || {worker, _NodeId, _Remote, Pid} <- get_all()]. 171 | 172 | 173 | %% @private 174 | stop(Pid) -> 175 | nkpacket_connection:stop(Pid). 176 | 177 | 178 | %% @private 179 | -spec encode(term()) -> 180 | binary(). 181 | 182 | encode(Msg) -> 183 | erlang:term_to_binary(Msg, [compressed]). 184 | 185 | 186 | %% =================================================================== 187 | %% Protocol callbacks 188 | %% =================================================================== 189 | 190 | -spec transports(nklib:scheme()) -> 191 | [nkpacket:transport()]. 192 | 193 | transports(nkcluster) -> 194 | [tcp, tls, sctp, ws, wss]. 195 | 196 | 197 | -spec default_port(nkpacket:transport()) -> 198 | inet:port_number() | invalid. 199 | 200 | default_port(tcp) -> 1972; 201 | default_port(tls) -> 1973; 202 | default_port(sctp) -> 1972; 203 | default_port(ws) -> 1974; 204 | default_port(wss) -> 1975; 205 | default_port(_) -> invalid. 206 | 207 | 208 | -spec naptr(nklib:scheme(), string()) -> 209 | {ok, nkpacket:transport()} | invalid. 
210 | 211 | naptr(nkcluster, "nks+d2t") -> {ok, tls}; 212 | naptr(nkcluster, "nk+d2t") -> {ok, tcp}; 213 | naptr(nkcluster, "nks+d2w") -> {ok, wss}; 214 | naptr(nkcluster, "nk+d2w") -> {ok, ws}; 215 | naptr(nkcluster, "nk+d2s") -> {ok, sctp}; 216 | naptr(_, _) -> invalid. 217 | 218 | 219 | -spec encode(term(), nkpacket:nkport()) -> 220 | {ok, nkpacket:outcoming()} | continue | {error, term()}. 221 | 222 | encode(Term, _NkPort) -> 223 | {ok, encode(Term)}. 224 | 225 | 226 | 227 | %% =================================================================== 228 | %% Connection callbacks 229 | %% =================================================================== 230 | 231 | 232 | -record(state, { 233 | nkport :: nkpacket:nkport(), 234 | type :: control | worker, 235 | node_id :: term(), 236 | auth = false :: boolean(), 237 | cluster :: term(), 238 | password :: term(), 239 | remote_node_id :: term(), 240 | remote_listen = [] :: [nklib:uri()], 241 | remote_meta = [] :: [nklib:token()], 242 | auth_froms = [] :: [{pid(), reference()}], 243 | vsn :: term(), 244 | pos_id :: trans_id(), 245 | control :: pid(), 246 | worker_master = false :: boolean() 247 | }). 248 | 249 | 250 | -spec conn_init(nkpacket:nkport()) -> 251 | {ok, #state{}}. 
252 | 253 | conn_init(NkPort) -> 254 | NodeId = nkcluster_agent:node_id(), 255 | {ok, #{type:=Type}=User} = nkpacket:get_user(NkPort), 256 | State = #state{ 257 | nkport = NkPort, 258 | node_id = NodeId, 259 | pos_id = erlang:phash2({nklib_util:l_timestamp(), NodeId}) * 1000, 260 | password = maps:get(password, User, undefined) 261 | }, 262 | lager:debug("NkCLUSTER node ~s (~p) starting connection", [NodeId, Type]), 263 | case User of 264 | #{type:=listen} -> 265 | % We don't know yet our type 266 | {ok, State}; 267 | #{type:=control} -> 268 | % Later we will call take_control 269 | State1 = State#state{type=control}, 270 | send_auth(NkPort, State1); 271 | #{type:={control, Pid}} when is_pid(Pid) -> 272 | State1 = State#state{type=control, control=Pid}, 273 | send_auth(NkPort, State1); 274 | #{type:=worker} -> 275 | State1 = State#state{type=worker}, 276 | send_auth(NkPort, State1) 277 | end. 278 | 279 | 280 | %% @private 281 | -spec conn_parse(term()|close, nkpacket:nkpacket(), #state{}) -> 282 | {ok, #state{}} | {stop, term(), #state{}}. 
283 | 284 | conn_parse(close, _NkPort, State) -> 285 | {ok, State}; 286 | 287 | conn_parse({binary, WsBinary}, NkPort, State) -> 288 | conn_parse(WsBinary, NkPort, State); 289 | 290 | conn_parse(Data, NkPort, #state{auth=false}=State) -> 291 | case catch binary_to_term(Data) of 292 | {auth, Msg} -> 293 | process_auth(Msg, NkPort, State); 294 | Other -> 295 | lager:warning("NkCLUSTER node received unexpected object, closing: ~p", 296 | [Other]), 297 | {stop, normal, State} 298 | end; 299 | 300 | conn_parse(Data, NkPort, #state{auth=true, type=worker}=State) -> 301 | case catch binary_to_term(Data) of 302 | {set_master, Uris} -> 303 | ok = nkcluster_agent:update_cluster_addr(true, Uris), 304 | State1 = case State#state.worker_master of 305 | true -> 306 | State; 307 | false -> 308 | nklib_proc:put(nkcluster_worker_master), 309 | State#state{worker_master=true} 310 | end, 311 | {ok, State1}; 312 | {rpc, TransId, {req, nkcluster, {ping, Time}}} -> 313 | nkcluster_agent:received_ping(), 314 | Reply = {reply, {Time, nklib_util:l_timestamp()}}, 315 | ret_send({rep, TransId, Reply}, NkPort, State); 316 | {rpc, TransId, Rpc} -> 317 | case process_rpc(Rpc, {self(), TransId}) of 318 | defer -> {ok, State}; 319 | Reply -> ret_send({rep, TransId, Reply}, NkPort, State) 320 | end; 321 | Other -> 322 | lager:warning("NkCLUSTER Worker received unexpected object, " 323 | "closing: ~p", [Other]), 324 | {stop, normal, State} 325 | end; 326 | 327 | conn_parse(Data, _NkPort, #state{auth=true, type=control}=State) -> 328 | case catch binary_to_term(Data) of 329 | announce -> 330 | #state{remote_node_id=RemNodeId} = State, 331 | nkcluster_nodes:node_announce(RemNodeId, self()), 332 | {ok, State}; 333 | {rep, TransId, Reply} -> 334 | process_resp(TransId, Reply, State); 335 | {ev, Class, Event} -> 336 | process_event(Class, Event, State); 337 | Other -> 338 | lager:warning("NkCLUSTER Control received unexpected object, " 339 | "closing: ~p", [Other]), 340 | {stop, normal, State} 341 | 
end. 342 | 343 | 344 | %% @private 345 | -spec conn_handle_call(term(), {pid(), term()}, nkpacket:nkpacket(), #state{}) -> 346 | {ok, #state{}} | {stop, term(), #state{}}. 347 | 348 | conn_handle_call(wait_auth, From, NkPort, #state{auth=true}=State) -> 349 | gen_server:reply(From, get_remote(NkPort, State)), 350 | {ok, State}; 351 | 352 | conn_handle_call(wait_auth, From, _NkPort, State) -> 353 | #state{auth_froms=Froms} = State, 354 | {ok, State#state{auth_froms=[From|Froms]}}; 355 | 356 | conn_handle_call({rpc, Rpc}, From, NkPort, #state{pos_id=TransId}=State) -> 357 | do_send_rpc({rpc, TransId, Rpc}, From, NkPort, State); 358 | 359 | conn_handle_call({send, Msg}, From, NkPort, #state{auth=true}=State)-> 360 | ret_send2(Msg, From, NkPort, State); 361 | 362 | conn_handle_call({send, _Msg}, From, _NkPort, State) -> 363 | gen_server:reply(From, {error, not_authenticated}), 364 | {ok, State}; 365 | 366 | conn_handle_call(Msg, _From, _NkPort, State) -> 367 | lager:error("Module ~p received unexpected call: ~p", [?MODULE, Msg]), 368 | {stop, unexpected_call, State}. 369 | 370 | 371 | %% @private 372 | -spec conn_handle_cast(term(), nkpacket:nkpacket(), #state{}) -> 373 | {ok, #state{}} | {stop, term(), #state{}}. 374 | 375 | conn_handle_cast({take_control, Proxy}, _NkPort, #state{type=control}=State) -> 376 | nkpacket_connection:update_monitor(self(), Proxy), 377 | {ok, State#state{control=Proxy}}; 378 | 379 | conn_handle_cast(Msg, _NkPort, State) -> 380 | lager:error("Module ~p received unexpected cast: ~p", [?MODULE, Msg]), 381 | {ok, State}. 382 | 383 | 384 | %% @private 385 | -spec conn_handle_info(term(), nkpacket:nkpacket(), #state{}) -> 386 | {ok, #state{}} | {stop, term(), #state{}}. 
387 | 388 | % nkcluster_jobs launches processes with start_link 389 | conn_handle_info({'EXIT', _, normal}, _NkPort, State) -> 390 | {ok, State}; 391 | 392 | conn_handle_info(Msg, _NkPort, State) -> 393 | lager:info("Module ~p received unexpected info: ~p", [?MODULE, Msg]), 394 | {ok, State}. 395 | 396 | 397 | %% @doc Called when the connection stops 398 | -spec conn_stop(Reason::term(), nkpacket:nkpacket(), #state{}) -> 399 | ok. 400 | 401 | conn_stop(_Reason, NkPort, #state{type=Type}) -> 402 | lager:info("NkCLUSTER node (~p) disconnected from ~s", [Type, get_remote_id(NkPort)]). 403 | 404 | 405 | %% =================================================================== 406 | %% Internal 407 | %% =================================================================== 408 | 409 | %% @private 410 | -spec do_call(pid()|nkpacket:nkport(), term()) -> 411 | term(). 412 | 413 | do_call(Id, Msg) -> 414 | Pid = nkpacket:pid(Id), 415 | case catch gen_server:call(Pid, Msg, 5*60*1000) of 416 | {'EXIT', Error} -> {error, {process_failed, Error}}; 417 | Other -> Other 418 | end. 419 | 420 | 421 | %% @private 422 | send_auth(NkPort, #state{node_id=NodeId, type=Type}=State) -> 423 | AuthMsg = #{ 424 | stage => 1, 425 | vsns => ?VSNS, 426 | id => NodeId, 427 | cluster => nkcluster_app:get(cluster_name), 428 | time => nklib_util:l_timestamp() div 1000, 429 | type => Type 430 | }, 431 | case raw_send({auth, AuthMsg}, NkPort) of 432 | ok -> 433 | {ok, State}; 434 | error -> 435 | {stop, connection_error} 436 | end. 
437 | 438 | 439 | %% @private 440 | do_send_rpc(Msg, From, NkPort, #state{auth=true, pos_id=TransId}=State) -> 441 | case raw_send(Msg, NkPort) of 442 | ok -> 443 | gen_server:reply(From, {ok, TransId}), 444 | {ok, State#state{pos_id=TransId+1}}; 445 | error -> 446 | gen_server:reply(From, {error, connection_error}), 447 | {stop, normal, State} 448 | end; 449 | 450 | do_send_rpc(_Msg, From, _NkPort, State) -> 451 | gen_server:reply(From, {error, not_authenticated}), 452 | {ok, State}. 453 | 454 | 455 | 456 | %% @private 457 | %% Processed at the listening side 458 | process_auth(#{stage:=1, vsns:=Vsns, id:=RemNodeId, cluster:=Cluster, 459 | type:=Type, time:=Time}, NkPort, State) -> 460 | case select_vsn(Vsns) of 461 | error -> 462 | not_authorized(unsupported_version, NkPort, State); 463 | Vsn -> 464 | Now = nklib_util:l_timestamp() div 1000, 465 | Drift = abs(Now-Time), 466 | case Drift > ?MAX_TIME_DIFF of 467 | true -> 468 | lager:warning("NkCLUSTER node big time drift: ~p", [Drift]); 469 | % not_authorized(time_drift, State); 470 | false -> 471 | ok 472 | end, 473 | Hash = make_auth_hash(RemNodeId, NkPort, State), 474 | #state{node_id=NodeId} = State, 475 | Stage2 = #{ 476 | stage => 2, 477 | id => NodeId, 478 | vsn => Vsn, 479 | hash => Hash 480 | }, 481 | OurType = case Type of 482 | control -> worker; 483 | worker -> control 484 | end, 485 | State1 = State#state{ 486 | type = OurType, 487 | cluster = Cluster, 488 | remote_node_id = RemNodeId, 489 | vsn=Vsn 490 | }, 491 | ret_send({auth, Stage2}, NkPort, State1) 492 | end; 493 | 494 | %% Processed at the connecting side 495 | process_auth(#{stage:=2, id:=ListenId, vsn:=Vsn, hash:=Hash}, NkPort, State) -> 496 | true = lists:member(Vsn, ?VSNS), 497 | #state{node_id=NodeId} = State, 498 | case make_auth_hash(NodeId, NkPort, State) of 499 | Hash -> 500 | % The remote (listening) side has a valid password 501 | % We send listen and meta, and try to authenticate ourselves 502 | State1 = 
State#state{remote_node_id=ListenId}, 503 | Base3 = #{ 504 | stage => 3, 505 | hash => make_auth_hash(ListenId, NkPort, State), 506 | listen => nkcluster_app:get(listen), 507 | meta => nkcluster_app:get(meta) 508 | }, 509 | Stage3 = case nkcluster_agent:is_primary() of 510 | true -> Base3#{primary_nodes=>riak_core_node_watcher:nodes(nkdist)}; 511 | false -> Base3 512 | end, 513 | ret_send({auth, Stage3}, NkPort, State1); 514 | _ -> 515 | not_authorized(invalid_password, NkPort, State) 516 | end; 517 | 518 | %% Processed at the listening side again 519 | process_auth(#{stage:=3, listen:=Listen, meta:=Meta, hash:=Hash}=Msg, NkPort, State) -> 520 | #state{node_id=ListenId, type=Type} = State, 521 | case make_auth_hash(ListenId, NkPort, State) of 522 | Hash -> 523 | % The remote (connecting) side has a valid password 524 | % We send listen and meta 525 | lager:info("NkCLUSTER node (~p) connected to ~s", 526 | [Type, get_remote_id(NkPort)]), 527 | State1 = State#state{ 528 | remote_listen = Listen, 529 | remote_meta = Meta, 530 | auth = true 531 | }, 532 | register(State1), 533 | join_nodes(Msg), 534 | Base4 = #{ 535 | stage => 4, 536 | listen => nkcluster_app:get(listen), 537 | meta => nkcluster_app:get(meta) 538 | }, 539 | Stage4 = case nkcluster_agent:is_primary() of 540 | true -> Base4#{primary_nodes=>riak_core_node_watcher:nodes(nkdist)}; 541 | false -> Base4 542 | end, 543 | ret_send({auth, Stage4}, NkPort, State1); 544 | _ -> 545 | not_authorized(invalid_password, NkPort, State) 546 | end; 547 | 548 | %% Processed at the connecting side again, both sides are authenticated 549 | process_auth(#{stage:=4, listen:=Listen, meta:=Meta}=Msg, NkPort, State) -> 550 | State2 = State#state{ 551 | remote_listen = Listen, 552 | remote_meta = Meta 553 | }, 554 | #state{type=Type} = State, 555 | lager:info("NkCLUSTER node (~p) connected to ~s", [Type, get_remote_id(NkPort)]), 556 | join_nodes(Msg), 557 | register(State2), 558 | #state{auth_froms=AuthFroms} = State, 559 | 
lists:foreach( 560 | fun(From) -> gen_server:reply(From, get_remote(NkPort, State2)) end, 561 | AuthFroms), 562 | {ok, State2#state{auth=true, auth_froms=[]}}; 563 | 564 | process_auth({error, Error}, NkPort, #state{node_id=NodeId}=State) -> 565 | lager:notice("NkCLUSTER node ~s authentication error: ~p", [NodeId, Error]), 566 | not_authorized(Error, NkPort, State). 567 | 568 | 569 | %% @private 570 | process_rpc({req, Class, Req}, From) -> 571 | nkcluster_jobs:request(Class, Req, From); 572 | 573 | process_rpc({tsk, Class, TaskId, Spec}, From) -> 574 | nkcluster_jobs:task(Class, TaskId, Spec, From); 575 | 576 | process_rpc({cmd, Class, TaskId, Cmd}, From) -> 577 | nkcluster_jobs:command(Class, TaskId, Cmd, From). 578 | 579 | 580 | %% @private 581 | process_resp(TransId, Msg, #state{control=Pid}=State) -> 582 | nkcluster_node_proxy:received_resp(Pid, TransId, Msg), 583 | {ok, State}. 584 | 585 | %% @private 586 | process_event(Class, Event, #state{control=Pid}=State) -> 587 | nkcluster_node_proxy:received_event(Pid, Class, Event), 588 | {ok, State}. 589 | 590 | 591 | %% @private 592 | -spec ret_send2(msg()|binary(), {pid(), term()}, nkpacket:nkport(), #state{}) -> 593 | {ok, #state{}} | {stop, term(), #state{}}. 594 | 595 | ret_send2(Msg, From, NkPort, State) -> 596 | case raw_send(Msg, NkPort) of 597 | ok -> 598 | gen_server:reply(From, ok), 599 | {ok, State}; 600 | error -> 601 | gen_server:reply(From, {error, connection_error}), 602 | {stop, normal, State} 603 | end. 604 | 605 | 606 | %% @private 607 | -spec ret_send(msg()|binary(), nkpacket:nkport(), #state{}) -> 608 | {ok, #state{}} | {stop, term(), #state{}}. 609 | 610 | ret_send(Msg, NkPort, State) -> 611 | case raw_send(Msg, NkPort) of 612 | ok -> 613 | {ok, State}; 614 | error -> 615 | {stop, normal, State} 616 | end. 617 | 618 | 619 | %% @private 620 | -spec raw_send(msg()|binary(), nkpacket:nkport()) -> 621 | ok | error. 
622 | 623 | raw_send(Msg, NkPort) when is_binary(Msg) -> 624 | case nkpacket_connection_lib:raw_send(NkPort, Msg) of 625 | ok -> 626 | ok; 627 | {error, closed} -> 628 | error; 629 | {error, Error} -> 630 | lager:notice("NkCLUSTER node error sending ~p: ~p", [Msg, Error]), 631 | error 632 | end; 633 | 634 | raw_send(Msg, NkPort) -> 635 | raw_send(encode(Msg), NkPort). 636 | 637 | 638 | %% @private 639 | select_vsn(Vsns) -> 640 | case sort_vsns(Vsns, []) of 641 | [Vsn|_] -> Vsn; 642 | [] -> error 643 | end. 644 | 645 | 646 | %% @private 647 | sort_vsns([], Acc) -> 648 | lists:reverse(lists:sort(Acc)); 649 | 650 | sort_vsns([Vsn|Rest], Acc) -> 651 | case lists:member(Vsn, ?VSNS) of 652 | true -> sort_vsns(Rest, [Vsn|Acc]); 653 | false -> sort_vsns(Rest, Acc) 654 | end. 655 | 656 | 657 | %% @private 658 | make_auth_hash(Salt, NkPort, #state{password=Pass}) -> 659 | Pass2 = case Pass of 660 | undefined -> nkcluster_app:get(password); 661 | _ -> Pass 662 | end, 663 | {ok, {_Proto, Transp, _, _}} = nkpacket:get_local(NkPort), 664 | case Transp==tls orelse Transp==wss of 665 | true -> 666 | Pass2; 667 | false -> 668 | pbkdf2(Pass2, Salt) 669 | end. 670 | 671 | 672 | %% @private 673 | register(#state{type=Type, node_id=NodeId, remote_node_id=RemNodeId}) -> 674 | nklib_proc:put(nkcluster_conn, {Type, NodeId, RemNodeId}). 675 | 676 | 677 | %% @private 678 | not_authorized(Error, NkPort, #state{auth_froms=AuthFroms}=State) -> 679 | lists:foreach( 680 | fun(From) -> gen_server:reply(From, {error, Error}) end, 681 | AuthFroms), 682 | case Error of 683 | invalid_password -> timer:sleep(500); 684 | _ -> ok 685 | end, 686 | raw_send({auth, {error, Error}}, NkPort), 687 | {stop, normal, State}. 
688 | 689 | 690 | %% @private 691 | get_remote(NkPort, State) -> 692 | #state{ 693 | remote_node_id = NodeId, 694 | remote_listen = Listen, 695 | remote_meta = Meta 696 | } = State, 697 | Remote = get_remote_id(NkPort), 698 | Status = nkcluster_agent:get_status(), 699 | {ok, NodeId, #{status=>Status, listen=>Listen, meta=>Meta, remote=>Remote}}. 700 | 701 | 702 | %% @private 703 | get_remote_id(NkPort) -> 704 | {ok, {_, Transp, Ip, Port}} = nkpacket:get_remote(NkPort), 705 | nklib_util:bjoin([Transp, nklib_util:to_host(Ip), Port], <<":">>). 706 | 707 | 708 | %% @private 709 | join_nodes(#{primary_nodes:=Nodes}) -> 710 | nkcluster_agent:join_nodes(Nodes); 711 | join_nodes(_) -> 712 | ok. 713 | 714 | 715 | %% @private 716 | pbkdf2(Pass, Salt) -> 717 | Iters = nkcluster_app:get(pbkdf2_iters), 718 | {ok, Hash} = pbkdf2:pbkdf2(sha, Pass, Salt, Iters), 719 | Hash. 720 | 721 | -------------------------------------------------------------------------------- /src/nkcluster_sup.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 
18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @private NkCLUSTER main supervisor 22 | -module(nkcluster_sup). 23 | -author('Carlos Gonzalez '). 24 | -behaviour(supervisor). 25 | 26 | -export([init/1, start_link/0]). 27 | % -export([start_tasks_sup/0]). 28 | 29 | -include("nkcluster.hrl"). 30 | 31 | %% @private 32 | -spec start_link() -> 33 | {ok, pid()}. 34 | 35 | start_link() -> 36 | Primary = case nkcluster_agent:is_primary() of 37 | true -> 38 | [ 39 | {nkcluster_nodes, 40 | {nkcluster_nodes, start_link, []}, 41 | permanent, 42 | 5000, 43 | worker, 44 | [nkcluster_nodes] 45 | } 46 | ]; 47 | false -> 48 | [] 49 | end, 50 | ListenOpts1 = #{ 51 | srv_id => nkcluster, 52 | idle_timeout => 180000, 53 | tcp_packet => 4, 54 | ws_proto => nkcluster, 55 | user => #{type=>listen} 56 | }, 57 | ListenOpts2 = maps:merge(ListenOpts1, nkcluster_app:get(tls_opts)), 58 | ListenSpecs = lists:map( 59 | fun(Uri) -> 60 | {ok, Spec} = nkpacket:get_listener(Uri, ListenOpts2), 61 | Spec 62 | end, 63 | nkcluster_app:get(listen)), 64 | ChildsSpec = 65 | Primary 66 | ++ 67 | [ 68 | {nkcluster_agent, 69 | {nkcluster_agent, start_link, []}, 70 | permanent, 71 | 5000, 72 | worker, 73 | [nkcluster_agent] 74 | }, 75 | {nkcluster_jobs, 76 | {nkcluster_jobs, start_link, []}, 77 | permanent, 78 | 5000, 79 | worker, 80 | [nkcluster_jobs] 81 | } 82 | ] 83 | ++ 84 | ListenSpecs, 85 | supervisor:start_link({local, ?MODULE}, ?MODULE, {{one_for_one, 10, 60}, 86 | ChildsSpec}). 87 | 88 | 89 | %% @private 90 | init(ChildSpecs) -> 91 | {ok, ChildSpecs}. 92 | 93 | 94 | % %% @private 95 | % start_tasks_sup() -> 96 | % supervisor:start_link({local, nkcluster_tasks_sup}, 97 | % ?MODULE, {{one_for_one, 10, 60}, []}). 
98 | 99 | 100 | -------------------------------------------------------------------------------- /src/nkcluster_syntax.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(nkcluster_syntax). 22 | -author('Carlos Gonzalez '). 23 | 24 | -export([app_syntax/0, app_defaults/0]). 25 | 26 | -include_lib("nkpacket/include/nkpacket.hrl"). 
27 | 28 | %% =================================================================== 29 | %% Private 30 | %% =================================================================== 31 | 32 | app_syntax() -> 33 | #{ 34 | cluster_name => binary, 35 | cluster_addr => uris, 36 | password => binary, 37 | meta => tokens, 38 | type => {enum, [primary, secondary]}, 39 | listen => uris, 40 | ?TLS_SYNTAX, 41 | ping_time => {integer, 1000, 60000}, % Ping interval for agent and proxy 42 | proxy_connect_retry => {integer, 1000, none}, % After failure 43 | stats_time => {integer, 1000, none}, % Agent generation 44 | node_id => binary, % Force node id 45 | staged_joins => boolean, % For riak_core 46 | pbkdf2_iters => {integer, 1, none} % Password hash 47 | }. 48 | 49 | 50 | app_defaults() -> 51 | #{ 52 | cluster_name => "nkcluster", 53 | cluster_addr => "", 54 | password => "nkcluster", 55 | meta => "", 56 | type => primary, 57 | listen => "nkcluster://all;transport=tls", 58 | ping_time => 5000, 59 | proxy_connect_retry => 10000, 60 | stats_time => 10000, 61 | staged_joins => false, 62 | pbkdf2_iters => 20000 63 | }. 
64 | -------------------------------------------------------------------------------- /test/app.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, ""}, 5 | {type, primary}, 6 | {meta, "test1"}, 7 | {password, testpass}, 8 | {listen, 9 | "nkcluster://localhost:15001, 10 | nkcluster://localhost:15002;transport=tls, 11 | nkcluster://localhost:15003;transport=ws, 12 | nkcluster://localhost:15004;transport=wss"}, 13 | {stats_time, 2000}, 14 | {pbkdf2_iters, 1} 15 | ]}, 16 | 17 | {lager, [ 18 | {handlers, [ 19 | {lager_console_backend, info}, 20 | {lager_file_backend, [{file, "dev/1/log/error.log"}, {level, error}]}, 21 | {lager_file_backend, [{file, "dev/1/log/console.log"}, {level, info}]} 22 | ]}, 23 | {error_logger_redirect, false}, 24 | {crash_log, "log/crash.log"}, 25 | {colored, true}, 26 | {colors, [ 27 | {debug, "\e[0;38m" }, 28 | {info, "\e[0;32m" }, 29 | {notice, "\e[1;36m" }, 30 | {warning, "\e[1;33m" }, 31 | {error, "\e[1;31m" } 32 | ]} 33 | ]}, 34 | 35 | {sasl, [ 36 | {sasl_error_logger, false} 37 | ]}, 38 | 39 | {riak_core, [ 40 | {schema_dirs, ["util", "../util"]}, 41 | {enable_consensus, false}, 42 | 43 | 44 | %% Cluster name 45 | {cluster_name, "default"}, 46 | 47 | %% Default location for ring, cluster and other data files 48 | {platform_data_dir, "data"}, 49 | 50 | %% Default ring creation size. Make sure it is a power of 2, 51 | %% e.g. 16, 32, 64, 128, 256, 512 etc 52 | {ring_creation_size, 8}, 53 | 54 | %% Default gossip interval (milliseconds) 55 | {gossip_interval, 60000}, 56 | 57 | %% Target N value 58 | {target_n_val, 4}, 59 | 60 | %% Default claims functions 61 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 62 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 63 | 64 | %% Vnode inactivity timeout (how often to check if fallback vnodes 65 | %% should return their data) in ms. 
66 | {vnode_inactivity_timeout, 10000}, 67 | 68 | %% Number of VNodes allowed to do handoff concurrently. 69 | {handoff_concurrency, 2}, 70 | 71 | %% Disable Nagle on HTTP sockets 72 | {disable_http_nagle, true}, 73 | 74 | %% Handoff IP/port 75 | {handoff_port, 8101}, 76 | {handoff_ip, "0.0.0.0"}, 77 | 78 | %% Disterl buffer sizes in bytes. 79 | %% These sizes (3*128*1024 & 6*128*1024) were 80 | %% derived from a limited amount of testing in a 81 | %% 10GE environment, and may need tuning for your 82 | %% network and workload. In particular they're likely 83 | %% too small to be optimal for larger object sizes. 84 | {dist_send_buf_size, 393216}, 85 | {dist_recv_buf_size, 786432} 86 | ]} 87 | ]. 88 | -------------------------------------------------------------------------------- /test/basic_test.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(basic_test). 22 | -author('Carlos Gonzalez '). 23 | 24 | -compile([export_all]). 25 | -include_lib("nklib/include/nklib.hrl"). 26 | -include_lib("eunit/include/eunit.hrl"). 
27 | 28 | -define(RECV(Term), receive Term -> Term after 1000 -> error(?LINE) end). 29 | 30 | -define(EVENT(NodeId), event(NodeId, ?LINE)). 31 | 32 | basic_test_() -> 33 | {setup, 34 | fun() -> 35 | ok = nkcluster_app:start() 36 | end, 37 | fun(_) -> 38 | ok 39 | end, 40 | fun(_) -> 41 | [ 42 | fun() -> connect() end, 43 | {timeout, 10000, fun() -> proxy() end} 44 | ] 45 | end 46 | }. 47 | 48 | 49 | 50 | 51 | connect() -> 52 | ?debugMsg("Starting CONNECT test"), 53 | true = nkcluster_agent:is_primary(), 54 | ready = nkcluster_agent:get_status(), 55 | NodeId = nkcluster_agent:node_id(), 56 | {ok, []} = nkdist:get_procs(), 57 | {error, {invalid_scheme, http}} = nkcluster_agent:connect("http://localhost", #{}), 58 | {error, {invalid_transport, udp}} = nkcluster_agent:connect("nkcluster://localhost;transport=udp", #{}), 59 | 60 | {error, no_connections} = nkcluster_agent:connect("nkcluster://localhost", #{}), 61 | 62 | % We start a raw connection, no announce is sent 63 | % We use the local stored password 64 | {ok, NodeId, Info} = 65 | nkcluster_agent:connect("nkcluster://localhost, nkcluster://localhost:15001", #{}), 66 | #{ 67 | conn_pid:=ConnPid1, listen:=[#uri{}, #uri{}, #uri{}, #uri{}], meta:=[{<<"test1">>, []}], 68 | remote:=<<"tcp:127.0.0.1:15001">>, status:=ready 69 | } = Info, 70 | nkcluster_protocol:stop(ConnPid1), 71 | 72 | {error, no_connections} = nkcluster_agent:connect("nkcluster://localhost:15001", #{password=><<"bad">>}), 73 | {ok, NodeId, Info2} = nkcluster_agent:connect("nkcluster://localhost:15001", #{password=><<"testpass">>}), 74 | nkcluster_protocol:stop(maps:get(conn_pid, Info2)), 75 | 76 | {error, no_connections} = nkcluster_agent:connect("nkcluster://localhost:15001;password=bad", #{}), 77 | {ok, NodeId, Info3} = nkcluster_agent:connect( 78 | "nkcluster://localhost:15001;password=bad, nkcluster://localhost:15001;password=testpass", #{}), 79 | nkcluster_protocol:stop(maps:get(conn_pid, Info3)), 80 | 81 | {ok, NodeId, Info4} = 
nkcluster_agent:connect("nkcluster://localhost:15002;transport=tls", #{}), 82 | #{remote:=<<"tls:127.0.0.1:15002">>} = Info4, 83 | nkcluster_protocol:stop(maps:get(conn_pid, Info4)), 84 | 85 | % We must include a cacertfile to verify 86 | {error, no_connections} = nkcluster_agent:connect("nkcluster://localhost:15002;transport=tls", #{tls_verify=>true}), 87 | {error, {syntax_error, <<"tls_verify">>}} = nkcluster_agent:connect( 88 | "nkcluster://localhost:15002;transport=tls;tls_verify=hi", #{}), 89 | {error, no_connections} = nkcluster_agent:connect( 90 | "nkcluster://localhost:15002;transport=tls;tls_verify=true", #{}), 91 | 92 | {ok, NodeId, Info5} = nkcluster_agent:connect("nkcluster://localhost:15003;transport=ws", #{}), 93 | #{remote:=<<"ws:127.0.0.1:15003">>} = Info5, 94 | nkcluster_protocol:stop(maps:get(conn_pid, Info5)), 95 | 96 | {ok, NodeId, Info6} = nkcluster_agent:connect("nkcluster://localhost:15004;transport=wss", #{}), 97 | #{remote:=<<"wss:127.0.0.1:15004">>} = Info6, 98 | nkcluster_protocol:stop(maps:get(conn_pid, Info6)), 99 | timer:sleep(500), 100 | ok. 
101 | 102 | 103 | proxy() -> 104 | ?debugMsg("Starting PROXY test"), 105 | [] = nkcluster_nodes:get_nodes(), 106 | 107 | % We connect from the control cluster to a known node 108 | {ok, NodeId, Info, Proxy1} = nkcluster_nodes:connect("nkcluster://localhost:15001", #{}), 109 | #{ 110 | conn_pid:=_, listen:=[#uri{}, #uri{}, #uri{}, #uri{}], meta:=[{<<"test1">>, []}], 111 | remote:=<<"tcp:127.0.0.1:15001">>, status:=ready 112 | } = Info, 113 | timer:sleep(100), 114 | [NodeId] = nkcluster_nodes:get_nodes(), 115 | {ok, Info1} = nkcluster_nodes:get_node_info(NodeId), 116 | [NodeId] = nkcluster_nodes:get_local_nodes(), 117 | {ok, Info1} = nkcluster_nodes:get_local_node_info(NodeId), 118 | #{ 119 | id := NodeId, 120 | listen := [#uri{}, #uri{}, #uri{}, #uri{}], 121 | meta := [{<<"test1">>, []}], 122 | proxies := [Proxy1], 123 | status := ready 124 | } = Info1, 125 | 126 | % Now we stop the node 127 | % However, the proxy has registered its address with the worker, and it will 128 | % connect again 129 | ok = nkcluster_nodes:stop(NodeId), 130 | timer:sleep(100), 131 | ?debugMsg("waiting for Agent to reconnect..."), 132 | ok = wait_agent(NodeId, nklib_util:timestamp()+30), 133 | ?debugMsg("...reconnected!"), 134 | {ok, #{proxies:=[Proxy2]}} = nkcluster_nodes:get_node_info(NodeId), 135 | {ok, [{NodeId, Proxy2}]} = nkdist:get_procs(nkcluster_node_proxy), 136 | false = is_process_alive(Proxy1), 137 | true = Proxy1 /= Proxy2, 138 | 139 | % Now we kill the proxy process 140 | exit(Proxy2, kill), 141 | timer:sleep(100), 142 | ?debugMsg("waiting for Agent to reconnect..."), 143 | ok = wait_agent(NodeId, nklib_util:timestamp()+30), 144 | ?debugMsg("...reconnected!"), 145 | {ok, #{proxies:=[Proxy3]}} = nkcluster_nodes:get_node_info(NodeId), 146 | {ok, [{NodeId, Proxy3}]} = nkdist:get_procs(nkcluster_node_proxy), 147 | true = Proxy2 /= Proxy3, 148 | 149 | % Now we kill the connection. 
The proxy should reconnect immediately 150 | % Register our class for notifications 151 | ok = nklib_config:put(nkcluster_test, pid, self()), 152 | {module, _} = code:ensure_loaded(test_job_class), 153 | nkcluster_jobs:send_event(test_job_class, hi), 154 | 155 | hi = ?EVENT(NodeId), 156 | [{_, ConnPid}] = nklib_proc:values(nkcluster_worker_master), 157 | exit(ConnPid, kill), 158 | {nkcluster, {node_status, not_connected}} = ?EVENT(NodeId), 159 | {nkcluster, {node_status, ready}} = ?EVENT(NodeId), 160 | timer:sleep(1000), 161 | 162 | % Stop the node 163 | nkcluster_agent:clear_cluster_addr(), 164 | nkcluster_nodes:stop(NodeId), 165 | timer:sleep(500), 166 | {nkcluster, {node_status, not_connected}} = ?EVENT(NodeId), 167 | [] = nkcluster_nodes:get_nodes(), 168 | {ok, []} = nkdist:get_procs(nkcluster_node_proxy), 169 | ok. 170 | 171 | 172 | 173 | %%% Internal 174 | 175 | event(NodeId, Line) -> 176 | receive 177 | {test_job_class_event, NodeId, Msg} -> Msg 178 | after 1000 -> 179 | error(Line) 180 | end. 181 | 182 | wait_agent(NodeId, End) -> 183 | case lists:member(NodeId, nkcluster_nodes:get_nodes()) of 184 | true -> 185 | ok; 186 | false -> 187 | case nklib_util:timestamp() > End of 188 | true -> 189 | error; 190 | false -> 191 | timer:sleep(100), 192 | wait_agent(NodeId, End) 193 | end 194 | end. 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /test/cluster_test.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License.
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(cluster_test). 22 | -author('Carlos Gonzalez '). 23 | 24 | -compile([export_all]). 25 | -include_lib("nklib/include/nklib.hrl"). 26 | -include_lib("eunit/include/eunit.hrl"). 27 | 28 | -define(RECV(Term), receive Term -> Term after 1000 -> error(?LINE) end). 29 | 30 | -define(EVENT(NodeId), event(NodeId, ?LINE)). 31 | 32 | basic_test_() -> 33 | {setup, 34 | fun() -> ok = nkcluster_app:start() end, 35 | fun(_) -> ok end, 36 | fun(_) -> 37 | [ 38 | fun() -> master() end 39 | ] 40 | end 41 | }. 
42 | 43 | 44 | master() -> 45 | ?debugMsg("Starting MASTER test"), 46 | {ok, Master1} = nkdist_gen_server:get_master(nkcluster_nodes), 47 | {ok, #{nkcluster_nodes:=[Master1]}} = nkdist:get_masters(), 48 | 49 | {ok, VNode1} = nkdist:register(nkcluster_nodes), 50 | _ = ?RECV({nkdist_master, nkcluster_nodes, Master1}), 51 | Self = self(), 52 | {ok, #{nkcluster_nodes:=[Master1, Self]}} = nkdist:get_masters(), 53 | 54 | % Let's register another thing, at the same vnode 55 | Spawn = spawn_link( 56 | fun() -> 57 | Self2 = self(), 58 | {ok, VNode1Id} = nkdist:get_vnode(nkcluster_nodes), 59 | {ok, VNode1} = nkdist_vnode:register(VNode1Id, test, Self2), 60 | _ = ?RECV({nkdist_master, test, Self2}), 61 | {ok, #{nkcluster_nodes:=[Master1, Self], test:=[Self2]}} = nkdist:get_masters(), 62 | receive _Any -> error(?LINE) 63 | after 5000 -> ok 64 | end 65 | end), 66 | timer:sleep(500), 67 | {ok, #{nkcluster_nodes:=[Master1, Self], test:=[Spawn]}} = nkdist:get_masters(), 68 | 69 | lager:warning("Next warning about vnode being killed is expected"), 70 | exit(VNode1, kill), 71 | 72 | % We have killed the vnode 73 | % nkdist_gen_server processes like nkcluster_nodes will register again 74 | % automatically; we must do it ourselves 75 | timer:sleep(500), 76 | false = is_process_alive(VNode1), 77 | {ok, #{nkcluster_nodes:=[Master1]}} = nkdist:get_masters(), 78 | {ok, _VNode2} = nkdist:register(nkcluster_nodes), 79 | % After register, we receive a notify 80 | _ = ?RECV({nkdist_master, nkcluster_nodes, Master1}), 81 | {ok, #{nkcluster_nodes:=[Master1, Self]}} = nkdist:get_masters(), 82 | 83 | % If we kill the master, we will be elected 84 | exit(Master1, kill), 85 | _ = ?RECV({nkdist_master, nkcluster_nodes, Self}), 86 | % When the nkcluster_nodes process is restarted, we receive another notify 87 | _ = ?RECV({nkdist_master, nkcluster_nodes, Self}), 88 | timer:sleep(500), 89 | {ok, #{nkcluster_nodes:=[Self, Master2]}} = nkdist:get_masters(), 90 | Master2 = whereis(nkcluster_nodes), 91 | {ok, Self} =
nkdist_gen_server:get_master(nkcluster_nodes), 92 | ok. 93 | 94 | -------------------------------------------------------------------------------- /test/task_test.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(task_test). 22 | -author('Carlos Gonzalez '). 23 | 24 | -compile([export_all]). 25 | -include_lib("eunit/include/eunit.hrl"). 26 | 27 | -define(EVENT(NodeId), event(NodeId, ?LINE)). 28 | -define(CLASS, test_job_class). 29 | 30 | tasks_test_() -> 31 | {setup, 32 | fun() -> 33 | ok = nkcluster_app:start(), 34 | {ok, NodeId, _Info, _} = nkcluster_nodes:connect("nkcluster://127.0.0.1:15001", #{}), 35 | NodeId 36 | end, 37 | fun(NodeId) -> 38 | nkcluster_agent:clear_cluster_addr(), 39 | ok = nkcluster_nodes:stop(NodeId) 40 | end, 41 | fun(NodeId) -> 42 | [ 43 | fun() -> req1(NodeId) end, 44 | fun() -> req2(NodeId) end, 45 | fun() -> task(NodeId) end, 46 | fun() -> status(NodeId) end 47 | ] 48 | end 49 | }. 
50 | 51 | 52 | req1(NodeId) -> 53 | ?debugMsg("Starting REQ1 test"), 54 | 55 | {ok, [{<<"test1">>, []}]} = nkcluster:get_meta(NodeId), 56 | {ok, Meta1} = nkcluster:update_meta(NodeId, "meta2;a=1;b=2, meta3;master"), 57 | {ok, [ 58 | {<<"test1">>, []}, 59 | {<<"meta2">>, [{<<"a">>,<<"1">>}, {<<"b">>, <<"2">>}]}, 60 | {<<"meta3">>, [<<"master">>]}] 61 | } = {ok, Meta1} = nkcluster:get_meta(NodeId), 62 | {ok, Meta2} = nkcluster:update_meta(NodeId, "meta2;c=2"), 63 | {ok, [ 64 | {<<"test1">>, []}, 65 | {<<"meta2">>, [{<<"c">>,<<"2">>}]}, 66 | {<<"meta3">>, [<<"master">>]}] 67 | } = {ok, Meta2} = nkcluster:get_meta(NodeId), 68 | nkcluster:remove_meta(NodeId, "meta2, meta3"), 69 | {ok, [{<<"test1">>, []}]} = nkcluster:get_meta(NodeId), 70 | 71 | {ok, undefined} = nkcluster:get_data(NodeId, data1), 72 | ok = nkcluster:put_data(NodeId, data1, value1), 73 | {ok, value1} = nkcluster:get_data(NodeId, data1), 74 | ok = nkcluster:del_data(NodeId, data1), 75 | {ok, undefined} = nkcluster:get_data(NodeId, data1), 76 | 77 | {reply, 'nkcluster@127.0.0.1'} = nkcluster:call(NodeId, erlang, node, [], 1000), 78 | {reply, 'nkcluster@127.0.0.1'} = nkcluster:spawn_call(NodeId, erlang, node, [], 1000), 79 | {reply, ok} = nkcluster:call(NodeId, timer, sleep, [100], 1000), 80 | lager:notice("Next notice about 'unexpected response' is expected"), 81 | {error, timeout} = nkcluster:call(NodeId, timer, sleep, [200], 100), 82 | 83 | Bin1 = crypto:rand_bytes(500), 84 | ok = file:write_file("/tmp/nkcluster.1", Bin1), 85 | ok = nkcluster:send_file(NodeId, "/tmp/nkcluster.1", "/tmp/nkcluster.2"), 86 | % The remote node runs on this host, so we check the received copy 87 | {ok, Bin1} = file:read_file("/tmp/nkcluster.2"), 88 | 89 | Bin2 = crypto:rand_bytes(2*1024*1024), 90 | ok = file:write_file("/tmp/nkcluster.1", Bin2), 91 | ok = nkcluster:send_file(NodeId, "/tmp/nkcluster.1", "/tmp/nkcluster.2"), 92 | {ok, Bin2} = file:read_file("/tmp/nkcluster.2"), 93 | 94 | % ok = nkcluster:load_module(NodeId, ?MODULE), 95 | % ok = nkcluster:load_modules(NodeId, nkcluster), 96 |
ok. 97 | 98 | 99 | 100 | req2(NodeId) -> 101 | ?debugMsg("Starting REQ2 test"), 102 | 103 | {reply, my_response} = nkcluster:request(NodeId, ?CLASS, my_request), 104 | {error, unknown_request} = nkcluster:request(NodeId, ?CLASS, other), 105 | {error, {{error, my_error}, _}} = 106 | nkcluster:request(NodeId, ?CLASS, {error, my_error}), 107 | 108 | ok. 109 | 110 | 111 | task(NodeId) -> 112 | ?debugMsg("Starting TASK test"), 113 | nklib_config:put(nkcluster_test, pid, self()), 114 | 115 | {ok, []} = nkcluster:get_tasks(NodeId, ?CLASS), 116 | {ok, Task1} = nkcluster:task(NodeId, ?CLASS, task1), 117 | {nkcluster, {task_started, Task1}} = ?EVENT(NodeId), 118 | {ok, [Task1]} = nkcluster:get_tasks(NodeId, ?CLASS), 119 | {reply, reply2, Task2} = nkcluster:task(NodeId, ?CLASS, task2), 120 | {nkcluster, {task_started, Task2}} = ?EVENT(NodeId), 121 | {ok, List2} = nkcluster:get_tasks(NodeId, ?CLASS), 122 | true = lists:sort([Task1, Task2]) == lists:sort(List2), 123 | 124 | {ok, Pid2} = nkcluster_jobs:get_task(?CLASS, Task2), 125 | exit(Pid2, kill), 126 | {nkcluster, {task_stopped, Task2, killed}} = ?EVENT(NodeId), 127 | {ok, [Task1]} = nkcluster:get_tasks(NodeId, ?CLASS), 128 | 129 | {error, task_not_found} = nkcluster:command(NodeId, ?CLASS, Task2, cmd1), 130 | {reply, reply1} = nkcluster:command(NodeId, ?CLASS, Task1, cmd1), 131 | {reply, reply2} = nkcluster:command(NodeId, ?CLASS, Task1, cmd2), 132 | {reply, reply3} = nkcluster:command(NodeId, ?CLASS, Task1, cmd3), 133 | {nkcluster, {task_stopped, Task1, {error3, _}}} = ?EVENT(NodeId), 134 | {ok, []} = nkcluster:get_tasks(NodeId, ?CLASS), 135 | 136 | {ok, Task3} = nkcluster:task(NodeId, ?CLASS, task1), 137 | {nkcluster, {task_started, Task3}} = ?EVENT(NodeId), 138 | {reply, reply4} = nkcluster:command(NodeId, ?CLASS, Task3, cmd4), 139 | event4 = ?EVENT(NodeId), 140 | {reply, stop} = nkcluster:command(NodeId, ?CLASS, Task3, stop), 141 | {nkcluster, {task_stopped, Task3, normal}} = ?EVENT(NodeId), 142 | {ok, []} = 
nkcluster:get_tasks(NodeId, ?CLASS), 143 | 144 | ok. 145 | 146 | 147 | 148 | status(NodeId) -> 149 | ?debugMsg("Starting STATUS test"), 150 | nklib_config:put(nkcluster_test, pid, self()), 151 | 152 | {ok, []} = nkcluster:get_tasks(NodeId, ?CLASS), 153 | {ok, Task1} = nkcluster:task(NodeId, ?CLASS, task1), 154 | {nkcluster, {task_started, Task1}} = ?EVENT(NodeId), 155 | 156 | ok = nkcluster:set_status(NodeId, standby), 157 | {nkcluster, {node_status, standby}} = ?EVENT(NodeId), 158 | {status, Task1, standby} = ?EVENT(NodeId), 159 | {error, {node_not_ready, standby}} = nkcluster:task(NodeId, ?CLASS, task1), 160 | 161 | ok = nkcluster:set_status(NodeId, stopped), 162 | {nkcluster, {node_status, stopping}} = ?EVENT(NodeId), 163 | {status, Task1, stopping} = ?EVENT(NodeId), 164 | %% The task receives the stop request and will stop after 0.5 secs 165 | {nkcluster, {task_stopped, Task1, normal}} = ?EVENT(NodeId), 166 | %% Agent will check every 1 sec to see if tasks have stopped 167 | {nkcluster, {node_status, stopped}} = ?EVENT(NodeId), 168 | 169 | ok = nkcluster:set_status(NodeId, ready), 170 | {nkcluster, {node_status, ready}} = ?EVENT(NodeId), 171 | 172 | ok. 173 | 174 | 175 | 176 | 177 | 178 | %%% Internal 179 | 180 | event(NodeId, Line) -> 181 | receive 182 | {test_job_class_event, NodeId, Msg} -> Msg 183 | after 5000 -> 184 | error(Line) 185 | end. 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /test/test_job_class.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Carlos Gonzalez Florido. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License.
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc NkCLUSTER Core Worker Processes 22 | -module(test_job_class). 23 | -author('Carlos Gonzalez '). 24 | -behaviour(nkcluster_job_class). 25 | 26 | -export([request/2, task/2, command/3, status/2, event/2]). 27 | 28 | 29 | %% =================================================================== 30 | %% nkcluster_jobs 31 | %% =================================================================== 32 | 33 | %% @private 34 | -spec request(term(), nkcluster_protocol:from()) -> 35 | {reply, term()} | {error, term()} | defer. 36 | 37 | request(my_request, _From) -> 38 | {reply, my_response}; 39 | 40 | request({error, Error}, _From) -> 41 | error(Error); 42 | 43 | request(_, _From) -> 44 | {error, unknown_request}. 45 | 46 | 47 | %% @private 48 | -spec task(nkcluster:task_id(), term()) -> 49 | {ok, pid()} | {ok, term(), pid()} | {error, term()}. 50 | 51 | 52 | task(TaskId, task1) -> 53 | {ok, spawn_link(fun() -> my_task(TaskId) end)}; 54 | 55 | task(TaskId, task2) -> 56 | {ok, reply2, spawn_link(fun() -> my_task(TaskId) end)}; 57 | 58 | task(_TaskId, _) -> 59 | {error, unknown_task}. 60 | 61 | 62 | %% @private 63 | -spec command(pid(), nkcluster:command(), nkcluster_protocol:from()) -> 64 | {reply, ok} | defer. 65 | 66 | command(Pid, cmd1, _From) -> 67 | Ref = make_ref(), 68 | Pid ! 
{cmd1, Ref, self()}, 69 | receive 70 | {Ref, Res} -> {reply, Res} 71 | after 72 | 1000 -> error(?LINE) 73 | end; 74 | 75 | command(Pid, cmd2, From) -> 76 | Pid ! {cmd2, From}, 77 | defer; 78 | 79 | command(Pid, cmd3, _From) -> 80 | Pid ! cmd3, 81 | {reply, reply3}; 82 | 83 | command(Pid, cmd4, _From) -> 84 | Pid ! cmd4, 85 | {reply, reply4}; 86 | 87 | command(Pid, stop, _From) -> 88 | Pid ! stop, 89 | {reply, stop}; 90 | 91 | command(_Pid, _Cmd, _From) -> 92 | {reply, unknown_command}. 93 | 94 | 95 | -spec status(pid(), nkcluster:node_status()) -> 96 | ok. 97 | 98 | status(Pid, Status) -> 99 | Pid ! {status, Status}, 100 | ok. 101 | 102 | 103 | %% @private 104 | -spec event(nkcluster:node_id(), nkcluster:event()) -> 105 | ok. 106 | 107 | event(NodeId, Data) -> 108 | % lager:warning("W: ~p", [Data]), 109 | case nklib_config:get(nkcluster_test, pid) of 110 | Pid when is_pid(Pid) -> 111 | Pid ! {test_job_class_event, NodeId, Data}; 112 | _ -> 113 | lager:info("Test Jobs Event: ~p", [{NodeId, Data}]) 114 | end. 115 | 116 | 117 | %%%% Internal 118 | 119 | my_task(TaskId) -> 120 | receive 121 | {cmd1, Ref, Pid} -> 122 | Pid ! {Ref, reply1}, 123 | my_task(TaskId); 124 | {cmd2, From} -> 125 | nkcluster_jobs:reply(From, {reply, reply2}), 126 | my_task(TaskId); 127 | cmd3 -> 128 | error(error3); 129 | cmd4 -> 130 | nkcluster_jobs:send_event(test_job_class, event4), 131 | my_task(TaskId); 132 | stop -> 133 | ok; 134 | {status, Status} -> 135 | nkcluster_jobs:send_event(test_job_class, {status, TaskId, Status}), 136 | case Status of 137 | stopping -> erlang:send_after(500, self(), stop); 138 | _ -> ok 139 | end, 140 | my_task(TaskId) 141 | end. 
142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /test/vm.args: -------------------------------------------------------------------------------- 1 | -pa deps/basho_stats/ebin 2 | -pa deps/bear/ebin 3 | -pa deps/clique/ebin 4 | -pa deps/cowboy/ebin 5 | -pa deps/cowlib/ebin 6 | -pa deps/cuttlefish/ebin 7 | -pa deps/down/ebin 8 | -pa deps/eleveldb/ebin 9 | -pa deps/eper/ebin 10 | -pa deps/exometer_core/ebin 11 | -pa deps/folsom/ebin 12 | -pa deps/getopt/ebin 13 | -pa deps/goldrush/ebin 14 | -pa deps/gun/ebin 15 | -pa deps/jiffy/ebin 16 | -pa deps/jsx/ebin 17 | -pa deps/lager/ebin 18 | -pa deps/meck/ebin 19 | -pa deps/neotoma/ebin 20 | -pa deps/nkdist/ebin 21 | -pa deps/nkdocker/ebin 22 | -pa deps/nklib/ebin 23 | -pa deps/nkpacket/ebin 24 | -pa deps/parse_trans/ebin 25 | -pa deps/pbkdf2/ebin 26 | -pa deps/poolboy/ebin 27 | -pa deps/ranch/ebin 28 | -pa deps/riak_core/ebin 29 | -pa deps/riak_ensemble/ebin 30 | -pa deps/riak_sysmon/ebin 31 | -pa deps/setup/ebin 32 | -pa ../nkcluster/ebin 33 | 34 | ## Name of the node 35 | -name nkcluster@127.0.0.1 36 | -setcookie nk 37 | 38 | ## Treat error_logger warnings as warnings 39 | +W w 40 | -------------------------------------------------------------------------------- /util/dev1.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, "nkcluster://localhost:15001, nkcluster://localhost:15002, nkcluster://localhost:15003"}, 5 | {type, primary}, 6 | {meta, "control,node1,order;odd=true"}, 7 | {listen, "nkcluster://localhost:15001"}, 8 | {node_id, "dev1"} 9 | ]}, 10 | 11 | {lager, [ 12 | {handlers, [ 13 | {lager_console_backend, info}, 14 | {lager_file_backend, [{file, "dev/1/log/error.log"}, {level, error}]}, 15 | {lager_file_backend, [{file, "dev/1/log/console.log"}, {level, info}]} 16 | ]}, 17 | {error_logger_redirect, false}, 18 | {crash_log, 
"log/crash.log"}, 19 | {colored, true}, 20 | {colors, [ 21 | {debug, "\e[0;38m" }, 22 | {info, "\e[0;32m" }, 23 | {notice, "\e[1;36m" }, 24 | {warning, "\e[1;33m" }, 25 | {error, "\e[1;31m" } 26 | ]} 27 | ]}, 28 | 29 | {sasl, [ 30 | {sasl_error_logger, false} 31 | ]}, 32 | 33 | {riak_core, [ 34 | {schema_dirs, ["util"]}, 35 | {enable_consensus, false}, 36 | 37 | 38 | %% Cluster name 39 | {cluster_name, "default"}, 40 | 41 | %% Default location for ring, cluster and other data files 42 | {platform_data_dir, "dev/1/data"}, 43 | 44 | %% Default ring creation size. Make sure it is a power of 2, 45 | %% e.g. 16, 32, 64, 128, 256, 512 etc 46 | {ring_creation_size, 8}, 47 | 48 | %% Default gossip interval (milliseconds) 49 | {gossip_interval, 60000}, 50 | 51 | %% Target N value 52 | {target_n_val, 4}, 53 | 54 | %% Default claims functions 55 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 56 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 57 | 58 | %% Vnode inactivity timeout (how often to check if fallback vnodes 59 | %% should return their data) in ms. 60 | {vnode_inactivity_timeout, 10000}, 61 | 62 | %% Number of VNodes allowed to do handoff concurrently. 63 | {handoff_concurrency, 2}, 64 | 65 | %% Disable Nagle on HTTP sockets 66 | {disable_http_nagle, true}, 67 | 68 | %% Handoff IP/port 69 | {handoff_port, 8101}, 70 | {handoff_ip, "0.0.0.0"}, 71 | 72 | %% Disterl buffer sizes in bytes. 73 | %% These sizes (3*128*1024 & 6*128*1024) were 74 | %% derived from a limited amount of testing in a 75 | %% 10GE environment, and may need tuning for your 76 | %% network and workload. In particular they're likely 77 | %% too small to be optimal for larger object sizes. 78 | {dist_send_buf_size, 393216}, 79 | {dist_recv_buf_size, 786432} 80 | ]} 81 | ]. 
82 | -------------------------------------------------------------------------------- /util/dev2.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, "nkcluster://localhost:15001, nkcluster://localhost:15002, nkcluster://localhost:15003"}, 5 | {type, primary}, 6 | {meta, "control,node2"}, 7 | {listen, "nkcluster://localhost:15002"}, 8 | {node_id, "dev2"} 9 | ]}, 10 | 11 | {lager, [ 12 | {handlers, [ 13 | {lager_console_backend, info}, 14 | {lager_file_backend, [{file, "dev/2/log/error.log"}, {level, error}]}, 15 | {lager_file_backend, [{file, "dev/2/log/console.log"}, {level, info}]} 16 | ]}, 17 | {error_logger_redirect, false}, 18 | {crash_log, "log/crash.log"}, 19 | {colored, true}, 20 | {colors, [ 21 | {debug, "\e[0;38m" }, 22 | {info, "\e[0;32m" }, 23 | {notice, "\e[1;36m" }, 24 | {warning, "\e[1;33m" }, 25 | {error, "\e[1;31m" } 26 | ]} 27 | ]}, 28 | 29 | {sasl, [ 30 | {sasl_error_logger, false} 31 | ]}, 32 | 33 | {riak_core, [ 34 | {schema_dirs, ["util"]}, 35 | {enable_consensus, false}, 36 | 37 | 38 | %% Cluster name 39 | {cluster_name, "default"}, 40 | 41 | %% Default location for ring, cluster and other data files 42 | {platform_data_dir, "dev/2/data"}, 43 | 44 | %% Default ring creation size. Make sure it is a power of 2, 45 | %% e.g. 16, 32, 64, 128, 256, 512 etc 46 | {ring_creation_size, 8}, 47 | 48 | %% Default gossip interval (milliseconds) 49 | {gossip_interval, 60000}, 50 | 51 | %% Target N value 52 | {target_n_val, 4}, 53 | 54 | %% Default claims functions 55 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 56 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 57 | 58 | %% Vnode inactivity timeout (how often to check if fallback vnodes 59 | %% should return their data) in ms. 60 | {vnode_inactivity_timeout, 10000}, 61 | 62 | %% Number of VNodes allowed to do handoff concurrently. 
63 | {handoff_concurrency, 2}, 64 | 65 | %% Disable Nagle on HTTP sockets 66 | {disable_http_nagle, true}, 67 | 68 | %% Handoff IP/port 69 | {handoff_port, 8102}, 70 | {handoff_ip, "0.0.0.0"}, 71 | 72 | %% Disterl buffer sizes in bytes. 73 | %% These sizes (3*128*1024 & 6*128*1024) were 74 | %% derived from a limited amount of testing in a 75 | %% 10GE environment, and may need tuning for your 76 | %% network and workload. In particular they're likely 77 | %% too small to be optimal for larger object sizes. 78 | {dist_send_buf_size, 393216}, 79 | {dist_recv_buf_size, 786432} 80 | ]} 81 | ]. 82 | -------------------------------------------------------------------------------- /util/dev3.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, "nkcluster://localhost:15001, nkcluster://localhost:15002, nkcluster://localhost:15003"}, 5 | {type, primary}, 6 | {meta, "control,node3,order;odd=true"}, 7 | {listen, "nkcluster://localhost:15003"}, 8 | {node_id, "dev3"} 9 | ]}, 10 | 11 | {lager, [ 12 | {handlers, [ 13 | {lager_console_backend, info}, 14 | {lager_file_backend, [{file, "dev/3/log/error.log"}, {level, error}]}, 15 | {lager_file_backend, [{file, "dev/3/log/console.log"}, {level, info}]} 16 | ]}, 17 | {error_logger_redirect, false}, 18 | {crash_log, "log/crash.log"}, 19 | {colored, true}, 20 | {colors, [ 21 | {debug, "\e[0;38m" }, 22 | {info, "\e[0;32m" }, 23 | {notice, "\e[1;36m" }, 24 | {warning, "\e[1;33m" }, 25 | {error, "\e[1;31m" } 26 | ]} 27 | ]}, 28 | 29 | {sasl, [ 30 | {sasl_error_logger, false} 31 | ]}, 32 | 33 | {riak_core, [ 34 | {schema_dirs, ["util"]}, 35 | {enable_consensus, false}, 36 | 37 | 38 | %% Cluster name 39 | {cluster_name, "default"}, 40 | 41 | %% Default location for ring, cluster and other data files 42 | {platform_data_dir, "dev/3/data"}, 43 | 44 | %% Default ring creation size. Make sure it is a power of 2, 45 | %% e.g. 
16, 32, 64, 128, 256, 512 etc 46 | {ring_creation_size, 8}, 47 | 48 | %% Default gossip interval (milliseconds) 49 | {gossip_interval, 60000}, 50 | 51 | %% Target N value 52 | {target_n_val, 4}, 53 | 54 | %% Default claims functions 55 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 56 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 57 | 58 | %% Vnode inactivity timeout (how often to check if fallback vnodes 59 | %% should return their data) in ms. 60 | {vnode_inactivity_timeout, 10000}, 61 | 62 | %% Number of VNodes allowed to do handoff concurrently. 63 | {handoff_concurrency, 2}, 64 | 65 | %% Disable Nagle on HTTP sockets 66 | {disable_http_nagle, true}, 67 | 68 | %% Handoff IP/port 69 | {handoff_port, 8103}, 70 | {handoff_ip, "0.0.0.0"}, 71 | 72 | %% Disterl buffer sizes in bytes. 73 | %% These sizes (3*128*1024 & 6*128*1024) were 74 | %% derived from a limited amount of testing in a 75 | %% 10GE environment, and may need tuning for your 76 | %% network and workload. In particular they're likely 77 | %% too small to be optimal for larger object sizes. 78 | {dist_send_buf_size, 393216}, 79 | {dist_recv_buf_size, 786432} 80 | ]} 81 | ]. 
82 | -------------------------------------------------------------------------------- /util/dev4.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, "nkcluster://localhost:15001, nkcluster://localhost:15002, nkcluster://localhost:15003"}, 5 | {type, secondary}, 6 | {meta, "node4"}, 7 | {listen, "nkcluster://localhost:15004"}, 8 | {node_id, "dev4"} 9 | ]}, 10 | 11 | {lager, [ 12 | {handlers, [ 13 | {lager_console_backend, info}, 14 | {lager_file_backend, [{file, "dev/4/log/error.log"}, {level, error}]}, 15 | {lager_file_backend, [{file, "dev/4/log/console.log"}, {level, info}]} 16 | ]}, 17 | {error_logger_redirect, false}, 18 | {crash_log, "log/crash.log"}, 19 | {colored, true}, 20 | {colors, [ 21 | {debug, "\e[0;38m" }, 22 | {info, "\e[0;32m" }, 23 | {notice, "\e[1;36m" }, 24 | {warning, "\e[1;33m" }, 25 | {error, "\e[1;31m" } 26 | ]} 27 | ]}, 28 | 29 | {sasl, [ 30 | {sasl_error_logger, false} 31 | ]}, 32 | 33 | {riak_core, [ 34 | {schema_dirs, ["util"]}, 35 | {enable_consensus, false}, 36 | 37 | 38 | %% Cluster name 39 | {cluster_name, "default"}, 40 | 41 | %% Default location for ring, cluster and other data files 42 | {platform_data_dir, "dev/4/data"}, 43 | 44 | %% Default ring creation size. Make sure it is a power of 2, 45 | %% e.g. 16, 32, 64, 128, 256, 512 etc 46 | {ring_creation_size, 8}, 47 | 48 | %% Default gossip interval (milliseconds) 49 | {gossip_interval, 60000}, 50 | 51 | %% Target N value 52 | {target_n_val, 4}, 53 | 54 | %% Default claims functions 55 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 56 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 57 | 58 | %% Vnode inactivity timeout (how often to check if fallback vnodes 59 | %% should return their data) in ms. 60 | {vnode_inactivity_timeout, 60000}, 61 | 62 | %% Number of VNodes allowed to do handoff concurrently. 
63 | {handoff_concurrency, 2}, 64 | 65 | %% Disable Nagle on HTTP sockets 66 | {disable_http_nagle, true}, 67 | 68 | %% Handoff IP/port 69 | {handoff_port, 8104}, 70 | {handoff_ip, "0.0.0.0"}, 71 | 72 | %% Disterl buffer sizes in bytes. 73 | %% These sizes (3*128*1024 & 6*128*1024) were 74 | %% derived from a limited amount of testing in a 75 | %% 10GE environment, and may need tuning for your 76 | %% network and workload. In particular they're likely 77 | %% too small to be optimal for larger object sizes. 78 | {dist_send_buf_size, 393216}, 79 | {dist_recv_buf_size, 786432} 80 | ]} 81 | ]. 82 | -------------------------------------------------------------------------------- /util/dev5.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | {cluster_name, test}, 4 | {cluster_addr, "nkcluster://localhost:15001, nkcluster://localhost:15002, nkcluster://localhost:15003"}, 5 | {type, secondary}, 6 | {meta, "node5,order;odd=true"}, 7 | {listen, "nkcluster://localhost:15005"}, 8 | {node_id, "dev5"} 9 | ]}, 10 | 11 | {lager, [ 12 | {handlers, [ 13 | {lager_console_backend, info}, 14 | {lager_file_backend, [{file, "dev/5/log/error.log"}, {level, error}]}, 15 | {lager_file_backend, [{file, "dev/5/log/console.log"}, {level, info}]} 16 | ]}, 17 | {error_logger_redirect, false}, 18 | {crash_log, "log/crash.log"}, 19 | {colored, true}, 20 | {colors, [ 21 | {debug, "\e[0;38m" }, 22 | {info, "\e[0;32m" }, 23 | {notice, "\e[1;36m" }, 24 | {warning, "\e[1;33m" }, 25 | {error, "\e[1;31m" } 26 | ]} 27 | ]}, 28 | 29 | {sasl, [ 30 | {sasl_error_logger, false} 31 | ]}, 32 | 33 | {riak_core, [ 34 | {schema_dirs, ["util"]}, 35 | {enable_consensus, false}, 36 | 37 | 38 | %% Cluster name 39 | {cluster_name, "default"}, 40 | 41 | %% Default location for ring, cluster and other data files 42 | {platform_data_dir, "dev/5/data"}, 43 | 44 | %% Default ring creation size. Make sure it is a power of 2, 45 | %% e.g. 
16, 32, 64, 128, 256, 512 etc 46 | {ring_creation_size, 8}, 47 | 48 | %% Default gossip interval (milliseconds) 49 | {gossip_interval, 60000}, 50 | 51 | %% Target N value 52 | {target_n_val, 4}, 53 | 54 | %% Default claims functions 55 | {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 56 | {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 57 | 58 | %% Vnode inactivity timeout (how often to check if fallback vnodes 59 | %% should return their data) in ms. 60 | {vnode_inactivity_timeout, 60000}, 61 | 62 | %% Number of VNodes allowed to do handoff concurrently. 63 | {handoff_concurrency, 2}, 64 | 65 | %% Disable Nagle on HTTP sockets 66 | {disable_http_nagle, true}, 67 | 68 | %% Handoff IP/port 69 | {handoff_port, 8105}, 70 | {handoff_ip, "0.0.0.0"}, 71 | 72 | %% Disterl buffer sizes in bytes. 73 | %% These sizes (3*128*1024 & 6*128*1024) were 74 | %% derived from a limited amount of testing in a 75 | %% 10GE environment, and may need tuning for your 76 | %% network and workload. In particular they're likely 77 | %% too small to be optimal for larger object sizes. 78 | {dist_send_buf_size, 393216}, 79 | {dist_recv_buf_size, 786432} 80 | ]} 81 | ]. 
82 | -------------------------------------------------------------------------------- /util/dev_vm.args: -------------------------------------------------------------------------------- 1 | -pa deps/basho_stats/ebin 2 | -pa deps/bear/ebin 3 | -pa deps/clique/ebin 4 | -pa deps/cowboy/ebin 5 | -pa deps/cowlib/ebin 6 | -pa deps/cuttlefish/ebin 7 | -pa deps/down/ebin 8 | -pa deps/eleveldb/ebin 9 | -pa deps/eper/ebin 10 | -pa deps/exometer_core/ebin 11 | -pa deps/folsom/ebin 12 | -pa deps/getopt/ebin 13 | -pa deps/goldrush/ebin 14 | -pa deps/gun/ebin 15 | -pa deps/jiffy/ebin 16 | -pa deps/jsx/ebin 17 | -pa deps/lager/ebin 18 | -pa deps/meck/ebin 19 | -pa deps/neotoma/ebin 20 | -pa deps/nkdist/ebin 21 | -pa deps/nkdocker/ebin 22 | -pa deps/nklib/ebin 23 | -pa deps/nkpacket/ebin 24 | -pa deps/parse_trans/ebin 25 | -pa deps/pbkdf2/ebin 26 | -pa deps/poolboy/ebin 27 | -pa deps/ranch/ebin 28 | -pa deps/riak_core/ebin 29 | -pa deps/riak_ensemble/ebin 30 | -pa deps/riak_sysmon/ebin 31 | -pa deps/setup/ebin 32 | -pa ../nkcluster/ebin 33 | 34 | ## Name of the node 35 | -setcookie nk 36 | 37 | ## Treat error_logger warnings as warnings 38 | +W w 39 | -------------------------------------------------------------------------------- /util/riak_core.schema: -------------------------------------------------------------------------------- 1 | %%-*- mode: erlang -*- 2 | %% Default Bucket Properties 3 | 4 | %% @doc The number of replicas stored. Note: See Replication 5 | %% Properties for further discussion. 6 | %% http://docs.basho.com/riak/latest/dev/advanced/cap-controls/ 7 | {mapping, "buckets.default.n_val", "riak_core.default_bucket_props.n_val", [ 8 | {datatype, integer}, 9 | {default, 3}, 10 | hidden 11 | ]}. 12 | 13 | %% @doc Number of partitions in the cluster (only valid when first 14 | %% creating the cluster). Must be a power of 2, minimum 8 and maximum 15 | %% 1024. 
16 | {mapping, "ring_size", "riak_core.ring_creation_size", [ 17 | {datatype, integer}, 18 | {default, 64}, 19 | {validators, ["ring_size^2", "ring_size_max", "ring_size_min"]}, 20 | {commented, 64} 21 | ]}. 22 | 23 | %% ring_size validators 24 | {validator, "ring_size_max", "2048 and larger are supported, but considered advanced config", 25 | fun(Size) -> 26 | Size =< 1024 27 | end}. 28 | 29 | {validator, "ring_size^2", "not a power of 2", 30 | fun(Size) -> 31 | (Size band (Size-1) =:= 0) 32 | end}. 33 | 34 | {validator, "ring_size_min", "must be at least 8", 35 | fun(Size) -> 36 | Size >= 8 37 | end}. 38 | 39 | %% @doc Number of concurrent node-to-node transfers allowed. 40 | {mapping, "transfer_limit", "riak_core.handoff_concurrency", [ 41 | {datatype, integer}, 42 | {default, 2}, 43 | {commented, 2} 44 | ]}. 45 | 46 | %% @doc Default location of ringstate 47 | {mapping, "ring.state_dir", "riak_core.ring_state_dir", [ 48 | {datatype, directory}, 49 | {default, "$(platform_data_dir)/ring"}, 50 | hidden 51 | ]}. 52 | 53 | %% @doc Default cert location for https can be overridden 54 | %% with the ssl config variable, for example: 55 | {mapping, "ssl.certfile", "riak_core.ssl.certfile", [ 56 | {datatype, file}, 57 | {commented, "$(platform_etc_dir)/cert.pem"} 58 | ]}. 59 | 60 | %% @doc Default key location for https can be overridden with the ssl 61 | %% config variable, for example: 62 | {mapping, "ssl.keyfile", "riak_core.ssl.keyfile", [ 63 | {datatype, file}, 64 | {commented, "$(platform_etc_dir)/key.pem"} 65 | ]}. 66 | 67 | %% @doc Default signing authority location for https can be overridden 68 | %% with the ssl config variable, for example: 69 | {mapping, "ssl.cacertfile", "riak_core.ssl.cacertfile", [ 70 | {datatype, file}, 71 | {commented, "$(platform_etc_dir)/cacertfile.pem"} 72 | ]}. 73 | 74 | %% @doc handoff.port is the TCP port that Riak uses for 75 | %% intra-cluster data handoff. 
76 | {mapping, "handoff.port", "riak_core.handoff_port", [ 77 | {default, {{handoff_port}} }, 78 | {datatype, integer}, 79 | hidden 80 | ]}. 81 | 82 | %% @doc To encrypt riak_core intra-cluster data handoff traffic, 83 | %% uncomment the following line and edit its path to an appropriate 84 | %% certfile and keyfile. (This example uses a single file with both 85 | %% items concatenated together.) 86 | {mapping, "handoff.ssl.certfile", "riak_core.handoff_ssl_options.certfile", [ 87 | %% {commented, "/tmp/erlserver.pem"}, 88 | {datatype, file}, 89 | hidden 90 | ]}. 91 | 92 | %% @doc If you need a separate keyfile for handoff 93 | {mapping, "handoff.ssl.keyfile", "riak_core.handoff_ssl_options.keyfile", [ 94 | {datatype, file}, 95 | hidden 96 | ]}. 97 | 98 | %% @doc Enables/disables outbound handoff transfers for this node. If you 99 | %% turn this setting off at runtime with riak-admin, it will kill any 100 | %% outbound handoffs currently running. 101 | {mapping, "handoff.outbound", "riak_core.disable_outbound_handoff", [ 102 | {default, on}, 103 | {datatype, {flag, off, on}}, 104 | hidden 105 | ]}. 106 | 107 | %% @doc Enables/disables inbound handoff transfers for this node. If you 108 | %% turn this setting off at runtime with riak-admin, it will kill any 109 | %% inbound handoffs currently running. 110 | {mapping, "handoff.inbound", "riak_core.disable_inbound_handoff", [ 111 | {default, on}, 112 | {datatype, {flag, off, on}}, 113 | hidden 114 | ]}. 115 | 116 | %% @doc DTrace support. Do not enable 'dtrace' unless your Erlang/OTP 117 | %% runtime is compiled to support DTrace. DTrace is available in 118 | %% R15B01 (supported by the Erlang/OTP official source package) and in 119 | %% R14B04 via a custom source repository & branch. 120 | {mapping, "dtrace", "riak_core.dtrace_support", [ 121 | {default, off}, 122 | {datatype, flag} 123 | ]}. 
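Note on the `handoff.outbound`/`handoff.inbound` mappings above: the cuttlefish `{flag, off, on}` datatype inverts the user-facing value, since the riak.conf keys are phrased positively while the underlying riak_core keys are phrased as `disable_*`. A minimal sketch of the translation (the riak.conf line and generated term are illustrative, not verbatim cuttlefish output):

```erlang
%% riak.conf (illustrative):
%%   handoff.inbound = off
%%
%% With {datatype, {flag, off, on}}, the value "off" maps to true, so
%% cuttlefish emits the inverted riak_core application-env key:
[{riak_core, [
    {disable_inbound_handoff, true}
]}].
```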
124 | 125 | %% @doc Platform-specific installation paths (substituted by rebar) 126 | {mapping, "platform_bin_dir", "riak_core.platform_bin_dir", [ 127 | {datatype, directory}, 128 | {default, "{{platform_bin_dir}}"} 129 | ]}. 130 | 131 | %% @see platform_bin_dir 132 | {mapping, "platform_data_dir", "riak_core.platform_data_dir", [ 133 | {datatype, directory}, 134 | {default, "{{platform_data_dir}}"} 135 | ]}. 136 | 137 | %% @see platform_bin_dir 138 | {mapping, "platform_etc_dir", "riak_core.platform_etc_dir", [ 139 | {datatype, directory}, 140 | {default, "{{platform_etc_dir}}"} 141 | ]}. 142 | 143 | %% @see platform_bin_dir 144 | {mapping, "platform_lib_dir", "riak_core.platform_lib_dir", [ 145 | {datatype, directory}, 146 | {default, "{{platform_lib_dir}}"} 147 | ]}. 148 | 149 | %% @see platform_bin_dir 150 | {mapping, "platform_log_dir", "riak_core.platform_log_dir", [ 151 | {datatype, directory}, 152 | {default, "{{platform_log_dir}}"} 153 | ]}. 154 | 155 | %% @doc Enable consensus subsystem. Set to 'on' to enable the 156 | %% consensus subsystem used for strongly consistent Riak operations. 157 | {mapping, "strong_consistency", "riak_core.enable_consensus", [ 158 | {datatype, flag}, 159 | {default, off}, 160 | {commented, on} 161 | ]}. 162 | 163 | %% @doc Whether to enable the background manager globally. When 164 | %% enabled, participating Riak subsystems will coordinate access to 165 | %% shared resources. This will help to prevent system response 166 | %% degradation under times of heavy load from multiple background 167 | %% tasks. Specific subsystems may also have their own controls over 168 | %% use of the background manager. 169 | {mapping, "background_manager", "riak_core.use_background_manager", [ 170 | {datatype, flag}, 171 | {default, off}, 172 | hidden 173 | ]}. 174 | 175 | %% @doc Interval of time between vnode management 176 | %% activities. 
Modifying this will change the amount of time between 177 | %% attempts to trigger handoff between this node and any other member 178 | %% of the cluster. 179 | {mapping, "vnode_management_timer", "riak_core.vnode_management_timer", [ 180 | {default, "10s"}, 181 | {datatype, {duration, ms}}, 182 | hidden 183 | ]}. 184 | -------------------------------------------------------------------------------- /util/shell_app.config: -------------------------------------------------------------------------------- 1 | [ 2 | {nkcluster, [ 3 | ]}, 4 | 5 | {lager, [ 6 | {handlers, [ 7 | {lager_console_backend, info}, 8 | {lager_file_backend, [{file, "log/error.log"}, {level, error}]}, 9 | {lager_file_backend, [{file, "log/console.log"}, {level, info}]} 10 | ]}, 11 | {error_logger_redirect, false}, 12 | {crash_log, "log/crash.log"}, 13 | {colored, true}, 14 | {colors, [ 15 | {debug, "\e[0;38m" }, 16 | {info, "\e[0;32m" }, 17 | {notice, "\e[1;36m" }, 18 | {warning, "\e[1;33m" }, 19 | {error, "\e[1;31m" } 20 | ]} 21 | ]}, 22 | 23 | {sasl, [ 24 | {sasl_error_logger, false} 25 | ]}, 26 | 27 | {riak_core, [ 28 | {schema_dirs, ["deps/nkdist/util"]}, 29 | % {enable_consensus, true}, 30 | 31 | %% Cluster name 32 | {cluster_name, "default"}, 33 | 34 | %% Default location for ring, cluster and other data files 35 | {platform_data_dir, "data"}, 36 | 37 | %% Default ring creation size. Make sure it is a power of 2, 38 | %% e.g. 16, 32, 64, 128, 256, 512 etc 39 | {ring_creation_size, 8}, 40 | 41 | %% Default gossip interval (milliseconds) 42 | {gossip_interval, 60000}, 43 | 44 | %% Target N value 45 | {target_n_val, 4}, 46 | 47 | % %% Default claims functions 48 | % {wants_claim_fun, {riak_core_claim, default_wants_claim}}, 49 | % {choose_claim_fun, {riak_core_claim, default_choose_claim}}, 50 | 51 | %% Vnode inactivity timeout (how often to check if fallback vnodes 52 | %% should return their data) in ms. 
53 | {vnode_inactivity_timeout, 60000}, 54 | 55 | %% Number of VNodes allowed to do handoff concurrently. 56 | {handoff_concurrency, 2} 57 | ]} 58 | 59 | 60 | ]. 61 | -------------------------------------------------------------------------------- /util/shell_vm.args: -------------------------------------------------------------------------------- 1 | -pa deps/basho_stats/ebin 2 | -pa deps/bear/ebin 3 | -pa deps/clique/ebin 4 | -pa deps/cowboy/ebin 5 | -pa deps/cowlib/ebin 6 | -pa deps/cuttlefish/ebin 7 | -pa deps/down/ebin 8 | -pa deps/eleveldb/ebin 9 | -pa deps/eper/ebin 10 | -pa deps/exometer_core/ebin 11 | -pa deps/folsom/ebin 12 | -pa deps/getopt/ebin 13 | -pa deps/goldrush/ebin 14 | -pa deps/gun/ebin 15 | -pa deps/jiffy/ebin 16 | -pa deps/jsx/ebin 17 | -pa deps/lager/ebin 18 | -pa deps/meck/ebin 19 | -pa deps/neotoma/ebin 20 | -pa deps/nkdist/ebin 21 | -pa deps/nkdocker/ebin 22 | -pa deps/nklib/ebin 23 | -pa deps/nkpacket/ebin 24 | -pa deps/parse_trans/ebin 25 | -pa deps/pbkdf2/ebin 26 | -pa deps/poolboy/ebin 27 | -pa deps/ranch/ebin 28 | -pa deps/riak_core/ebin 29 | -pa deps/riak_ensemble/ebin 30 | -pa deps/riak_sysmon/ebin 31 | -pa deps/setup/ebin 32 | -pa ../nkcluster/ebin 33 | 34 | ## Name of the node 35 | -name nkcluster@127.0.0.1 36 | -setcookie nk 37 | 38 | ## Treat error_logger warnings as warnings 39 | +W w 40 | --------------------------------------------------------------------------------
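A closing note on the `ring_size^2` validator in `util/riak_core.schema`: it relies on the bitwise identity that a power of two has exactly one set bit, so `Size band (Size - 1)` clears that bit and yields zero only for powers of two. A standalone sketch of the same check (the function name is mine, for illustration only):

```erlang
%% A power of two has exactly one set bit; subtracting 1 flips that bit
%% and sets all lower bits, so ANDing the two values gives 0 only for
%% powers of two. E.g. 64 = 2#1000000, 63 = 2#0111111, 64 band 63 = 0.
is_power_of_two(Size) when is_integer(Size), Size > 0 ->
    Size band (Size - 1) =:= 0.

%% is_power_of_two(64) -> true
%% is_power_of_two(48) -> false   (48 band 47 = 32)
```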