├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── assets └── Snakepit_overview.png ├── bin ├── clean-images.sh ├── clean-service.sh ├── db-cat.sh ├── db-clean.sh ├── db-drop.sh ├── db-dump.sh ├── db-init.sh ├── db-restore.sh ├── prepare-directories.sh ├── prepare-images.sh ├── prepare-lxd.sh ├── prepare-service.sh └── publish-images.sh ├── package.json ├── scripts ├── clean.sh ├── daemon │ ├── data-ro.mount │ ├── run.sh │ ├── setup.sh │ ├── snakepit.service │ └── sshd_config ├── keygen.sh ├── nodemon.service ├── prepare.sh ├── scan.sh ├── setup-service.sh ├── snakepit.service └── worker │ ├── 20auto-upgrades │ ├── apt │ ├── apt-get │ ├── forwarder │ ├── forwarder.js │ ├── forwarder.sh │ ├── package-lock.json │ └── package.json │ ├── run.sh │ ├── setup.sh │ └── snakepit.service └── src ├── clusterParser.pegjs ├── config.js ├── models ├── Alias-model.js ├── Allocation-model.js ├── Group-model.js ├── Job-model.js ├── Node-model.js ├── Pit-model.js ├── Process-model.js ├── ProcessGroup-model.js ├── Resource-model.js ├── State-model.js ├── User-model.js ├── db.js └── index.js ├── pitRunner.js ├── reservations.js ├── routes ├── aliases.js ├── groups.js ├── index.js ├── jobs.js ├── mw.js ├── nodes.js └── users.js ├── scheduler.js ├── service.js └── utils ├── async.js ├── clusterEvents.js ├── dateTime.js ├── logger.js ├── lxd.js ├── scripts.js └── simplefs.js /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | src/clusterParser.js 3 | .pitconnect.txt 4 | .pituser.txt 5 | *.pem 6 | *.key 7 | *.crt 8 | 9 | # Logs 10 | logs 11 | *.log 12 | npm-debug.log* 13 | yarn-debug.log* 14 | yarn-error.log* 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | 28 | # nyc test coverage 29 | .nyc_output 30 | 31 | # Grunt intermediate 
storage (http://gruntjs.com/creating-plugins#storing-task-files) 32 | .grunt 33 | 34 | # Bower dependency directory (https://bower.io/) 35 | bower_components 36 | 37 | # node-waf configuration 38 | .lock-wscript 39 | 40 | # Compiled binary addons (http://nodejs.org/api/addons.html) 41 | build/Release 42 | 43 | # Dependency directories 44 | node_modules/ 45 | jspm_packages/ 46 | 47 | # Typescript v1 declaration files 48 | typings/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | 68 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. 
"Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. 
"Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. 
Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. 
Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. 
However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Snakepit 2 | 3 | Snakepit is a machine learning job scheduler with the following features: 4 | - Scheduling of concurrent machine learning jobs 5 | - Support for multi-machine and multi-GPU jobs 6 | - Job's tasks running in interconnected LXD containers with "root" access 7 | - Built-in user and group management 8 | - Jobs get access to training data according to user's access rights 9 | - Remote access through command line client over HTTP API 10 | - Remote data access through FUSE mounts (even during training) 11 | 12 | __The Snakepit service has not gone through an in-depth security-audit yet. 13 | Therefore you should not offer unknown/random users access to your service.__ 14 | 15 | ## Getting Started 16 | 17 | This following instructions are intended for administrative Snakepit users 18 | who want to configure and run an own Snakepit cluster. 19 | 20 | If you are a Snakepit end-user and just want to know how to run jobs 21 | on an existing Snakepit cluster, 22 | you should follow the [snakepit-client user-guide](https://github.com/mozilla/snakepit-client/) 23 | 24 | ### Big picture 25 | 26 | ![Overview - three jobs on a Snakepit cluster](assets/Snakepit_overview.png) 27 | 28 | - The typical setup for a Snakepit machine learning cluster is to have a so called __head node__ machine and a bunch of __worker node__ machines. 29 | - The head node is typically hosting the __Snakepit service__ and has/provides access to the outer world. In our scenario it also contains all the (training) data. 30 | - The worker nodes are connected to the head node through a (high speed) local network and are typically equipped with __GPUs__. You can also run a one-machine setup (e.g. for testing or development) where head node and worker nodes are essentially on one machine. 
31 | 32 | - A job is started on the user's computer through snakepit's command-line client from within a git repository checkout. 33 | - The client sends all relevant information of the checkout (address of repository, hash, diff, ...) through Snakepit's __HTTP API__ to the Snakepit service. 34 | - The snakepit service now starts a so called __pit__ (like a "pod" in Kubernetes). 35 | - A pit (and therefore a job) consists of __processes__ and associated data (checkout of job repository). 36 | - Each process is represented by its own __LXD container__ which is a para-virtualized environment using Linux-namespaces (think of a lightweight virtual machine). 37 | - For each job (pit) there is exactly one so called __daemon process__ which will run on the head-node. Its responsibility is to provide data to the other processes of the pit. 38 | - The so called __worker processes__ of a pit can access the provided data through __sshfs__ mounts to the daemon process. 39 | - Each worker process executes the same __.compute__ script of the job (which is typically taken from the job's repository checkout). 40 | - All worker processes are running on worker nodes and each of them has exclusive access to its allocated sub-set of resources on it (typically GPUs). 41 | 42 | ### Prerequisites 43 | 44 | * At least one machine with at least one GPU (at the moment there is only support for Nvidia GPUs) 45 | * Latest Nvidia drivers for each GPU 46 | * [LXD](https://linuxcontainers.org/lxd/) (3.0+) installed on each machine 47 | * A front-end web-server of your choice on the main machine (optional but recommended) 48 | * git 49 | 50 | ### Configuring LXD 51 | 52 | Before Snakepit can get installed, LXD has to be configured on 53 | all involved machines (if not already done). 54 | So on each machine of your cluster you have to call 55 | ``` 56 | $ sudo lxd init 57 | ``` 58 | During the following questionnaire you'll be asked, if you want to create a new storage pool. 
58 | It is highly recommended to create a copy-on-write one on base of `zfs` or `btrfs`. 59 | Each machine's storage pool should have at least 10 GB of space. 60 | On the following question you should respond with `yes`: 61 | ``` 62 | Would you like LXD to be available over the network (yes/no) [default=no]? yes 63 | ``` 64 | You'll be asked to set a password which will be required later during Snakepit's setup. 65 | 66 | __After Snakepit is configured and/or the machine got added, you should unset it again:__ 67 | ``` 68 | $ lxc config unset core.trust_password 69 | ``` 70 | 71 | ### Installing 72 | 73 | All the following steps are only to be done on the head node. 74 | First you have to create a Snakepit user: 75 | ``` 76 | $ sudo adduser snakepit 77 | [...] 78 | ``` 79 | 80 | First clone the Snakepit project. 81 | From within Snakepit's project root, you can now call: 82 | ``` 83 | /path/to/snakepit/clone$ sudo bin/prepare-directories.sh snakepit /snakepit 84 | ``` 85 | This will create the required data directory structure in `/snakepit` owned by user `snakepit`. 86 | This directory is from now on called "data-root". 87 | You could also pick a different path. 88 | 89 | Now it's time to prepare the snakepit service: 90 | ``` 91 | /path/to/snakepit/clone$ sudo bin/prepare-service.sh /snakepit /path/to/snakepit/clone 92 | ``` 93 | This will create the `snakepit` LXD container and bind the data-root to its internal directory `/data` 94 | and `/path/to/snakepit/clone` to its internal directory `/code`. If you omit `/path/to/snakepit/clone`, 95 | the script will clone the project another time within the container into `/code`. 96 | The script is also automatically mapping the outer directory-owner of 97 | the data-root (in our case user `snakepit`) to its inner `root` user. 98 | 99 | If you get a line with "Problem accessing https://...:8443", you have to figure out the URL for 100 | the local LXD service and run the provided command. 
102 | The `bin/prepare-service.sh` script looks for the `lxdbr0` bridge network adapter (this is a default one in LXD). 103 | If not existing, it will create and attach it to the snakepit service container as `eth0`. 104 | The following commands will help you figuring out the service address: 105 | * `sudo lxc exec snakepit -- ip addr` lists all interfaces of the snakepit service container and their IP addresses 106 | * `sudo lxc network list` shows all LXD networks 107 | * `sudo lxc network show ` shows details and addresses of one network 108 | * `sudo lxc exec snakepit -- curl -k https://
:8443/1.0` tests an address from inside the snakepit service container 109 | 110 | Next step is to create the worker and daemon LXD container images: 111 | ``` 112 | /path/to/snakepit/clone$ sudo bin/prepare-images.sh 113 | ``` 114 | This is a highly automated process and should not require any interaction. 115 | 116 | After this you have the chance to install any required software into the worker image: 117 | ``` 118 | /path/to/snakepit/clone$ sudo lxc exec snakepit-worker -- bash 119 | root@snakepit-worker:/root# apt install some-requirement 120 | [...] 121 | root@snakepit-worker:/root# exit 122 | ``` 123 | 124 | Before the images can be used, you have to publish them: 125 | ``` 126 | /path/to/snakepit/clone$ sudo bin/publish-images.sh 127 | ``` 128 | 129 | ### Configuring NFS 130 | 131 | NFS is used for job data access. sshFS was used previously, but new workloads benefit from the faster disk access NFS allows. 132 | 133 | Steps below assume the following internal networking layout. Adjust accordingly if different. 134 | 135 | ```bash 136 | head node is at 192.168.1.1 137 | worker nodes are at 192.168.2.1, 192.168.3.1, etc 138 | ``` 139 | 140 | #### Configure NFS on the head node 141 | 142 | On the head node, install the nfs-server package. 143 | 144 | ```bash 145 | $ sudo apt install nfs-kernel-server 146 | ``` 147 | 148 | As root, add the following line to the `/etc/exports` file. 149 | 150 | ```bash 151 | /snakepit 192.168.0.0/16(rw,no_root_squash,no_subtree_check) 152 | ``` 153 | 154 | Then restart with `systemctl restart nfs-server`. Verify exports are working with `exportfs`. 155 | 156 | #### Configure NFS on the worker nodes 157 | 158 | The steps below need to be done on each worker node. 159 | 160 | Install the nfs client package. 161 | 162 | ```bash 163 | $ sudo apt install nfs-common 164 | ``` 165 | 166 | Determine the UID and GID of the snakepit user on the head node. 
167 | 168 | ```bash 169 | # on the head node 170 | 171 | # from the system 172 | $ id snakepit 173 | uid=1777(snakepit) gid=1777(snakepit) groups=1777(snakepit),27(sudo),110(lxd) 174 | 175 | # from snakepit config 176 | $ lxc exec snakepit -- cat /etc/snakepit/snakepit.conf | grep mountUid 177 | mountUid: "1777" 178 | ``` 179 | 180 | Create a snakepit user with the same UID and GID as on the head node. 181 | 182 | NFS won't work if the UID is not the same. 183 | 184 | ```bash 185 | $ sudo addgroup --gid 1777 snakepit 186 | $ sudo adduser --uid 1777 --gid 1777 --disabled-password --gecos '' snakepit 187 | ``` 188 | 189 | Create the mount point. 190 | 191 | ```bash 192 | $ sudo mkdir /mnt/snakepit 193 | ``` 194 | 195 | Edit /etc/fstab as root. Add the following line. 196 | 197 | ```bash 198 | 192.168.1.1:/snakepit /mnt/snakepit nfs nosuid,hard,tcp,bg,noatime 0 0 199 | ``` 200 | 201 | Mount and verify that it's working. 202 | 203 | ```bash 204 | $ sudo mount /mnt/snakepit 205 | $ ls -la /mnt/snakepit 206 | # there should be files owned by snakepit:snakepit 207 | ``` 208 | 209 | ### Access to Snakepit service 210 | 211 | The snakepit service itself only provides unencrypted HTTP access. 212 | Therefore it is highly recommended to run snakepit behind a front-end web server with HTTPS configuration. 213 | The front-end server has to forward requests to port 80 of the address of the `eth0` interface of 214 | the snakepit service (`sudo lxc exec snakepit -- ip addr`). 215 | You can check connectivity through 216 | ``` 217 | $ curl http:///hello 218 | Here I am 219 | ``` 220 | 221 | For clients to be able to connect to the service, they have to have access to a so called `.pitconnect.txt` file. 222 | Its first line has to be the (outer) service URL without trailing slash. 223 | If you have/want to go for a self-signed HTTPS certificate of your front-end server, 224 | you can add the certificate content under that first line in the `.pitconnect.txt` file. 
225 | The `.pitconnect.txt` is considered public and in case of a self-signed certificate 226 | it is to be distributed to users on a separate channel (like email). 227 | The snakepit client will only accept valid certificates or the one provided through the `.pitconnect.txt` file. 228 | 229 | ### First time use 230 | 231 | For the following steps you have to first [install the snakepit client](https://github.com/mozilla/snakepit-client/#installation). 232 | 233 | Within a directory that contains the `.pitconnect.txt` file (from the last step), 234 | you can now test your configuration end-to-end: 235 | ``` 236 | $ pit status 237 | No user info found. Seems like a new user or first time login from this machine. 238 | Please enter an existing or new username: tilman 239 | Found no user of that name. 240 | Do you want to register this usename (yes|no)? yes 241 | Full name: Tilman Kamp 242 | E-Mail address: ... 243 | New password: ************ 244 | Reinput a same one to confirm it: ************ 245 | JOB S SINCE UC% UM% USER TITLE RESOURCE 246 | ``` 247 | 248 | As you are the first user, Snakepit automatically granted you admin rights: 249 | ``` 250 | $ pit show me 251 | Username: tilman 252 | Full name: Tilman Kamp 253 | E-Mail address: ... 254 | Is administrator: yes 255 | ``` 256 | 257 | ### Adding nodes 258 | 259 | Before one can run jobs on a worker node, the node has to be added to the snakepit service: 260 | ``` 261 | $ pit add node:n0 endpoint=https://...:8443 262 | LXD endpoint password: ********** 263 | ``` 264 | Here we gave the node the short-name "n0" and its LXD API URL as endpoint. 265 | The password is the one that was specified during LXD configuration of the node. 266 | If the node has been added successfully, this password should be unset (see LXD config section). 
267 | 268 | If the node had been added successfully, you should take a look at the node's GPUs (also called resources): 269 | ``` 270 | $ pit show node:n0 271 | Node name: n0 272 | State: ONLINE 273 | Resources: 274 | 0: "GeForce GTX 1070" (cuda 0) 275 | 1: "GeForce GTX 1070" (cuda 1) 276 | ``` 277 | 278 | Time to define a model name alias: 279 | ``` 280 | $ pit add alias:gtx1070 name="GeForce GTX 1070" 281 | $ pit show node:n0 282 | Node name: n0 283 | State: ONLINE 284 | Resources: 285 | 0: "GeForce GTX 1070" aka "gtx1070" (cuda 0) 286 | 1: "GeForce GTX 1070" aka "gtx1070" (cuda 1) 287 | ``` 288 | 289 | Time to run a first test job: 290 | ``` 291 | $ pit run "First light" [2:gtx1070] -d 'cat /proc/driver/nvidia/gpus/**/*' -l 292 | Job number: 190 293 | Remote: origin 294 | Hash: ... 295 | Diff LoC: 0 296 | Resources: "[2:gtx1070]" 297 | 298 | [2018-12-14 17:04:58] [daemon] Pit daemon started 299 | [2018-12-14 17:05:01] [worker 0] Worker 0 started 300 | [2018-12-14 17:05:01] [worker 0] Model: GeForce GTX 1070 301 | [2018-12-14 17:05:01] [worker 0] IRQ: 139 302 | [2018-12-14 17:05:01] [worker 0] GPU UUID: ... 303 | [2018-12-14 17:05:01] [worker 0] Video BIOS: 86.04.26.00.80 304 | [2018-12-14 17:05:01] [worker 0] Bus Type: PCIe 305 | [2018-12-14 17:05:01] [worker 0] DMA Size: 47 bits 306 | [2018-12-14 17:05:01] [worker 0] DMA Mask: 0x7fffffffffff 307 | [2018-12-14 17:05:01] [worker 0] Bus Location: 0000:01:00.0 308 | [2018-12-14 17:05:01] [worker 0] Device Minor: 0 309 | [2018-12-14 17:05:01] [worker 0] Blacklisted: No 310 | [2018-12-14 17:05:01] [worker 0] Binary: "" 311 | [2018-12-14 17:05:01] [worker 0] Model: GeForce GTX 1070 312 | [2018-12-14 17:05:01] [worker 0] IRQ: 142 313 | [2018-12-14 17:05:01] [worker 0] GPU UUID: ... 
314 | [2018-12-14 17:05:01] [worker 0] Video BIOS: 86.04.26.00.80 315 | [2018-12-14 17:05:01] [worker 0] Bus Type: PCIe 316 | [2018-12-14 17:05:01] [worker 0] DMA Size: 47 bits 317 | [2018-12-14 17:05:01] [worker 0] DMA Mask: 0x7fffffffffff 318 | [2018-12-14 17:05:01] [worker 0] Bus Location: 0000:02:00.0 319 | [2018-12-14 17:05:01] [worker 0] Device Minor: 1 320 | [2018-12-14 17:05:01] [worker 0] Blacklisted: No 321 | [2018-12-14 17:05:01] [worker 0] Binary: "" 322 | [2018-12-14 17:05:01] [worker 0] Worker 0 ended with exit code 0 323 | [2018-12-14 17:05:01] [daemon] Worker 0 requested stop. Stopping pit... 324 | ``` 325 | 326 | __Et voilà - you got your first snakepit cluster.__ 327 | For further understanding of jobs and their runtime environment, 328 | refer to the [snakepit-client user-guide](https://github.com/mozilla/snakepit-client/). 329 | 330 | ## Configuration 331 | 332 | The configuration of the snakepit service is read from a YAML file at 333 | `/etc/snakepit/snakepit.conf` inside the snakepit container. 
334 | You can edit it through vim: 335 | ``` 336 | $ sudo lxc exec snakepit -- vim /etc/snakepit/snakepit.conf 337 | $ sudo lxc exec snakepit -- systemctl restart snakepit 338 | ``` 339 | 340 | Possible configuration values are: 341 | - interface: Interface(s) to bind the service to - default 0.0.0.0 (all) 342 | - port: Port of the service - default 80 343 | - logLevel: How verbose the service logs to system log (0=DEBUG, 1=INFO, 2=ERROR) - default 1 344 | - debugHttp: Debug HTTP activity (true/false) - default false 345 | - debugJobFS: Debug remote mount activity (true/false) - default false 346 | - tokenSecret: Path to a file containing crypto-secret for access tokens 347 | - tokenTTL: Lifetime of access tokens before users have to re-authenticate (time-value) - default 1d 348 | - hashRounds: How many hash rounds access tokens are going through - default 10 349 | - endpoint: Head node's LXD HTTPS API endpoint 350 | - clientKey: Path to cryptographic key file for accessing head node's LXD endpoint 351 | - clientCert: Path to cryptographic certificate file for accessing head node's LXD endpoint 352 | - lxdTimeout: HTTP timeout in seconds for all LXD API access (time-value) - default 10s 353 | - lxdBridge: Bridge name of the network bridge that each container should connect its first NIC with - default lxdbr0 354 | - lxdDomain: Domain name for all containers - default lxd 355 | - containerTimeout: Timeout for LXD container state change - default 30s 356 | - pollInterval: Polling interval for checking LXD container states in ms - default 1000 357 | - maxParallelPrep: Maximum number of parallel job preparations - default 2 358 | - maxPrepDuration: Timeout for preparation phase (time-value) - default 1h 359 | - maxStartDuration: Timeout for start phase (time-value) - default 5m 360 | - mountRoot: Path to data-root on head-node - default /snakepit 361 | - queryLimit: Maximum number of returned list entries per user query 362 | 363 | ## Managing data 364 | 365 | There 
are four different data domains in Snakepit. 366 | All of them are represented by certain sub-directories within the data-root directory. 367 | Jobs have the same read/write rights as their owning users. 368 | 369 | * Shared data: `/shared/` - Files in this directory are read-only for everyone and considered public. 370 | Only users with direct access to the head-node can change its contents. 371 | * Group data: `/groups//` - Admins and all members of the given group have read/write access to all contents. 372 | * User data: `/home//` - Admins and the given user have read-write access. 373 | * Job data: `/pits//` - Admins, the owning user and group members of groups specified in the "groups" property of the job have read-access. Only the running job is allowed to write data. 374 | 375 | `/cache/` contains all cached git clones. 376 | 377 | `/db.json` is the database of the snakepit service. 378 | 379 | ## Troubleshooting 380 | 381 | The snakepit service is running as a regular systemd service (named "snakepit") inside the snakepit container. 382 | So you can control it through `systemctl` and monitor it through `journalctl`. 383 | 384 | In case of a tough problem you can also stop the systemd service and run snakepit manually: 385 | ``` 386 | $ sudo lxc exec snakepit -- bash 387 | root@snakepit:~# systemctl stop snakepit 388 | root@snakepit:~# cd /code 389 | root@snakepit:/code# npm start 390 | 391 | > snakepit@0.0.1 start /code 392 | > node src/service.js 393 | 394 | get https://...:8443/1.0 395 | state head 1 396 | state n0 1 397 | get https://...:8443/1.0/containers 398 | pitReport [] 399 | 'Snakepit service running on 0.0.0.0:80' 400 | [...] 401 | ``` 402 | With configuration `logLevel: 0` this should give you a good start for figuring out what's going on. 
403 | 404 | To get a better understanding of how a running job/pit looks like from LXD's perspective, 405 | you could list the running containers: 406 | 407 | ``` 408 | $ sudo lxc list 409 | +---------------+---------+-----------------------+--------------------------+------------+-----------+ 410 | | NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | 411 | +---------------+---------+-----------------------+--------------------------+------------+-----------+ 412 | | snakepit | RUNNING | 192.168.... (eth0) | fd42:... (eth0) | PERSISTENT | | 413 | +---------------+---------+-----------------------+--------------------------+------------+-----------+ 414 | | sp-head-191-d | RUNNING | 10.125.... (eth0) | fd42:... (eth0) | PERSISTENT | | 415 | +---------------+---------+-----------------------+--------------------------+------------+-----------+ 416 | | sp-n0-191-0 | RUNNING | 10.125.... (eth0) | fd42:... (eth0) | PERSISTENT | | 417 | +---------------+---------+-----------------------+--------------------------+------------+-----------+ 418 | ``` 419 | 420 | As you can see, a Snakepit container name (with the exception of Snakepit's service container) consists of the following parts (in given order): 421 | * "sp-": The common prefix allows using LXD for other purposes than Snakepit as long as containers are prefixed differently. 422 | * "<node-name>-": The name of the node the container runs on. This is required in case of double-adding a node for single-node setups and demos (like in this case). 423 | * "<pit-number>-": The pit number which is also the job number. 424 | * "<process-specifier>": "d" in case of the pit's daemon and the process/worker index in case of a worker process. 425 | 426 | ## Help 427 | 428 | 1. [**IRC**](https://wiki.mozilla.org/IRC) - You can contact us on the `#machinelearning` channel on [Mozilla IRC](https://wiki.mozilla.org/IRC); people there can try to answer/help 429 | 430 | 2. 
[**Issues**](https://github.com/mozilla/snakepit/issues) - If you think you ran into a serious problem, feel free to open an issue in our repo. 431 | -------------------------------------------------------------------------------- /assets/Snakepit_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/snakepit/0b6c1a4ff9fd9d58643b6c0c2615b7e934c257bb/assets/Snakepit_overview.png -------------------------------------------------------------------------------- /bin/clean-images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | lxc delete --force snakepit-worker 4 | lxc delete --force snakepit-daemon 5 | lxc image delete snakepit-worker 6 | lxc image delete snakepit-daemon -------------------------------------------------------------------------------- /bin/clean-service.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | lxc delete --force snakepit 4 | lxc image delete snakepit 5 | -------------------------------------------------------------------------------- /bin/db-cat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 4 | exit 1 5 | fi 6 | 7 | pg_dump -U postgres snakepit 8 | -------------------------------------------------------------------------------- /bin/db-clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 
4 | exit 1 5 | fi 6 | 7 | /code/bin/db-drop.sh 8 | /code/bin/db-init.sh 9 | -------------------------------------------------------------------------------- /bin/db-drop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 4 | exit 1 5 | fi 6 | 7 | echo "Dropping snakepit DB..." 8 | dropdb -U postgres snakepit 9 | echo "Done." 10 | -------------------------------------------------------------------------------- /bin/db-dump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 4 | exit 1 5 | fi 6 | if [ "$#" -ne 1 ]; then 7 | echo "Usage: db-dump.sh some-dump-file.sql" 8 | exit 1 9 | fi 10 | 11 | echo "Writing snakepit DB to dump file $1..." 12 | pg_dump -U postgres snakepit > $1 13 | echo "Done." 14 | -------------------------------------------------------------------------------- /bin/db-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 4 | exit 1 5 | fi 6 | 7 | echo "Creating snakepit DB..." 8 | createdb -U postgres snakepit 9 | echo "Done." -------------------------------------------------------------------------------- /bin/db-restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -f /code/bin/db-init.sh ]; then 3 | echo "This command should be run inside the snakepit container." 4 | exit 1 5 | fi 6 | if [ "$#" -ne 1 ]; then 7 | echo "Usage: db-restore.sh some-dump-file.sql" 8 | exit 1 9 | fi 10 | 11 | /code/bin/db-drop.sh 12 | echo "Restoring snakepit DB from dump file $1..." 
13 | psql -U postgres snakepit < $1 14 | echo "Done." 15 | -------------------------------------------------------------------------------- /bin/prepare-directories.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ] ; then 4 | echo "Usage: prepare-directories.sh " 5 | exit 1 6 | fi 7 | 8 | owner=$1 9 | root_dir=$2 10 | 11 | if ! id "$owner" >/dev/null 2>&1; then 12 | echo "Unknown user: $owner" 13 | exit 1 14 | fi 15 | 16 | mkdir -p "$root_dir" 17 | mkdir -p "$root_dir/shared" 18 | mkdir -p "$root_dir/home" 19 | mkdir -p "$root_dir/groups" 20 | mkdir -p "$root_dir/pits" 21 | mkdir -p "$root_dir/trash" 22 | 23 | chown -R $owner:$owner "$root_dir" 24 | chmod -R 700 "$root_dir" 25 | -------------------------------------------------------------------------------- /bin/prepare-images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | roles=(daemon worker) 4 | 5 | print_header () { 6 | printf "\n>>>>>>>> $1 <<<<<<<<\n\n" 7 | } 8 | 9 | print_header "Configuring image source" 10 | bin/prepare-lxd.sh 11 | 12 | for role in "${roles[@]}"; do 13 | print_header "Creating ${role} image" 14 | lxc init ubuntu-minimal:18.04/amd64 snakepit-${role} 15 | lxc start snakepit-${role} 16 | exe="lxc exec snakepit-${role} -- " 17 | sleep 2 18 | $exe systemctl isolate multi-user.target 19 | 20 | print_header "Starting ${role} setup" 21 | tar cf - -C scripts/${role} . | lxc exec snakepit-${role} -- tar xvf - --no-same-owner -C /root 22 | $exe bash /root/setup.sh 23 | done 24 | -------------------------------------------------------------------------------- /bin/prepare-lxd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ $(lxc remote list | grep ubuntu-minimal | wc -l) -gt "0" ]; then 5 | echo "Remote ubuntu-minimal already configured - skipping..." 
6 | else 7 | echo "Adding remote ubuntu-minimal..." 8 | lxc remote add --protocol simplestreams ubuntu-minimal https://cloud-images.ubuntu.com/minimal/releases/ 9 | fi -------------------------------------------------------------------------------- /bin/prepare-service.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | if [ $# -ne 1 ] && [ $# -ne 2 ] ; then 4 | echo "Usage: prepare-service-container.sh [code-path]" 5 | exit 1 6 | fi 7 | 8 | print_header () { 9 | printf "\n>>>>>>>> $1 <<<<<<<<\n\n" 10 | } 11 | 12 | print_header "Configuring image source" 13 | bin/prepare-lxd.sh 14 | 15 | lxc init ubuntu-minimal:18.04/amd64 snakepit 16 | 17 | print_header "Configuring virtual drives" 18 | uid=`ls -ldn "$1" | awk '{print $3}'` 19 | lxc config set snakepit raw.idmap "both $uid 0" 20 | lxc config device add snakepit data disk path=/data source="$1" 21 | if [ $# -eq 2 ]; then 22 | lxc config device add snakepit code disk path=/code source="$2" 23 | fi 24 | 25 | print_header "Starting image..." 26 | lxc start snakepit 27 | exe="lxc exec snakepit -- " 28 | $exe systemctl isolate multi-user.target 29 | 30 | print_header "Installing dependencies..." 31 | $exe bash -c 'DEBIAN_FRONTEND=noninteractive apt-get -yq update && \ 32 | apt-get install -yq curl jq moreutils nodejs npm git git-lfs build-essential vim iputils-ping postgresql postgresql-contrib && \ 33 | git lfs install' 34 | 35 | if [ $# -ne 2 ]; then 36 | print_header "Cloning snakepit code base" 37 | $exe bash -c 'git clone https://github.com/mozilla/snakepit.git /code; cd /code; npm install' 38 | fi 39 | 40 | print_header "Getting endpoint address" 41 | if ! 
lxc network show lxdbr0 > /dev/null 2>&1; then 42 | lxc network create lxdbr0 43 | lxc network attach lxdbr0 snakepit default eth0 44 | fi 45 | address=`lxc network get lxdbr0 ipv4.address` 46 | address="`echo "$address" | cut -d/ -f 1`" 47 | endpoint="https://${address}:8443" 48 | echo "Using endpoint: $endpoint" 49 | 50 | print_header "Configuring service..." 51 | $exe /code/scripts/setup-service.sh "$endpoint" "$1" "$uid" 52 | -------------------------------------------------------------------------------- /bin/publish-images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | roles=(daemon worker) 4 | 5 | print_header () { 6 | printf "\n>>>>>>>> $1 <<<<<<<<\n\n" 7 | } 8 | 9 | for role in "${roles[@]}"; do 10 | print_header "Publishing ${role}..." 11 | lxc stop snakepit-${role} || true 12 | lxc image delete snakepit-${role} || true 13 | lxc publish --public snakepit-${role} --alias snakepit-${role} || true 14 | done 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "snakepit", 3 | "version": "0.0.1", 4 | "description": "Machine learning job scheduler", 5 | "bin": { 6 | "pit": "src/service.js" 7 | }, 8 | "scripts": { 9 | "start": "node src/service.js", 10 | "postinstall": "node node_modules/pegjs/bin/pegjs src/clusterParser.pegjs" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git+https://github.com/mozilla/snakepit.git" 15 | }, 16 | "keywords": [ 17 | "machine", 18 | "learning", 19 | "job", 20 | "scheduler", 21 | "daemon", 22 | "cli", 23 | "cuda", 24 | "gpu", 25 | "cluster", 26 | "worker", 27 | "node", 28 | "js" 29 | ], 30 | "author": "Tilman Kamp", 31 | "license": "MPL-2.0", 32 | "bugs": { 33 | "url": "https://github.com/mozilla/snakepit/issues" 34 | }, 35 | "homepage": "https://github.com/mozilla/snakepit#readme", 36 | 
"dependencies": { 37 | "assign-deep": "^1.0.1", 38 | "async": "^2.6.1", 39 | "async-parallel": "^1.2.3", 40 | "axios": "^0.18.1", 41 | "bcrypt": "^3.0.6", 42 | "body-parser": "^1.18.3", 43 | "buffer-serializer": "^1.1.0", 44 | "commander": "^2.14.1", 45 | "express": "^4.16.2", 46 | "express-promise-router": "^3.0.3", 47 | "fs-extra": "^7.0.1", 48 | "js-yaml": "^3.12.0", 49 | "jsonwebtoken": "^8.2.2", 50 | "morgan": "^1.9.0", 51 | "multi-integer-range": "^4.0.4", 52 | "parse-duration": "^0.1.1", 53 | "pegjs": "^0.10.0", 54 | "pg": "^7.7.1", 55 | "range-parser": "^1.2.0", 56 | "sequelize": ">=5.3.0", 57 | "tail": "^2.0.2", 58 | "uuid": "^8.3.0", 59 | "ws": "^6.2.0" 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /scripts/clean.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | ( 3 | echo "Cleaning started..." 4 | set -x 5 | rm -rf "$JOB_DIR/tmp" 6 | rm -rf "$JOB_DIR/src" 7 | echo "Cleaning done." 8 | ) 2>&1 | ts '[%Y-%m-%d %H:%M:%S] [clean]' >>"$JOB_DIR/pit.log" -------------------------------------------------------------------------------- /scripts/daemon/data-ro.mount: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Read-only data directory 3 | 4 | [Mount] 5 | What=/ro 6 | Where=/data/ro 7 | Type=fuse.bindfs 8 | Options=ro 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | -------------------------------------------------------------------------------- /scripts/daemon/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$HOSTNAME" = snakepit-daemon ]; then 4 | exit 0 5 | fi 6 | 7 | pit_root="/data/rw/pit" 8 | touch ${pit_root}/daemon.log 9 | print_log () { 10 | echo "[daemon] $1" >>${pit_root}/daemon.log 11 | } 12 | 13 | if [ -f "${pit_root}/run" ]; then 14 | print_log "This pit already ran. Requesting stop..." 
15 | touch "${pit_root}/stop" 16 | fi 17 | touch "${pit_root}/run" 18 | 19 | for worker_dir in ${pit_root}/workers/*/ ; do 20 | worker_dir=${worker_dir%*/} 21 | touch "${worker_dir}/worker.log" 22 | chown -R worker:worker "${worker_dir}" 23 | done 24 | 25 | tail -F -q ${pit_root}/daemon.log ${pit_root}/workers/**/worker.log | ts '[%Y-%m-%d %H:%M:%S]' >>${pit_root}/pit.log & 26 | 27 | print_log "Pit daemon started" 28 | while true; do 29 | for worker_dir in ${pit_root}/workers/*/ ; do 30 | worker_dir=${worker_dir%*/} 31 | worker_index=${worker_dir##*/} 32 | if [ -f "${worker_dir}/stop" ]; then 33 | print_log "Worker ${worker_index} requested stop. Stopping pit..." 34 | touch "${pit_root}/stop" 35 | poweroff 36 | fi 37 | done 38 | sleep 1 39 | done 40 | -------------------------------------------------------------------------------- /scripts/daemon/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | aptget() { 5 | DEBIAN_FRONTEND=noninteractive apt-get -yq "$@" 6 | } 7 | 8 | print_header () { 9 | printf "\n>>>>>>>> $1 <<<<<<<<\n\n" 10 | } 11 | 12 | mkdir /ro 13 | mkdir /data 14 | mkdir /data/ro 15 | mkdir /data/rw 16 | 17 | print_header "Installing dependencies" 18 | aptget update 19 | aptget install dhcpcd5 openssh-server vim iputils-ping moreutils bindfs 20 | 21 | print_header "Creating user worker" 22 | useradd -m -s /usr/sbin/nologin -u 2525 worker 23 | password=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1) 24 | echo "worker:${password}" | chpasswd 25 | mkdir -p /home/worker/.ssh 26 | chown worker:worker /home/worker/.ssh 27 | 28 | print_header "Installing read-only mount" 29 | mv data-ro.mount /lib/systemd/system/ 30 | systemctl enable data-ro.mount 31 | 32 | print_header "Configuring ssh-daemon" 33 | mv /root/sshd_config /etc/ssh/sshd_config 34 | 35 | print_header "Disabling cloud configuration" 36 | echo 'network: {config: disabled}' 
>/etc/cloud/cloud.cfg.d/99-disable-network-config.cfg 37 | 38 | print_header "Installing daemon service" 39 | mv run.sh /usr/bin/run.sh 40 | mv snakepit.service /lib/systemd/system/ 41 | systemctl enable snakepit 42 | -------------------------------------------------------------------------------- /scripts/daemon/snakepit.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=snakepit-daemon - Machine learning job scheduler daemon service 3 | Documentation=https://github.com/mozilla/snakepit 4 | After=network.target 5 | 6 | [Service] 7 | User=worker 8 | WorkingDirectory=/data 9 | Type=simple 10 | ExecStart=/usr/bin/run.sh 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /scripts/daemon/sshd_config: -------------------------------------------------------------------------------- 1 | PasswordAuthentication no 2 | ChallengeResponseAuthentication no 3 | AllowAgentForwarding no 4 | AllowTcpForwarding no 5 | X11Forwarding no 6 | Subsystem sftp internal-sftp 7 | Match User worker 8 | ChrootDirectory /data 9 | ForceCommand internal-sftp 10 | -------------------------------------------------------------------------------- /scripts/keygen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | tmpd=$(mktemp -d) 5 | mkfifo "${tmpd}/key" "${tmpd}/key.pub" 6 | ( 7 | cat "${tmpd}/key" 8 | echo "-----BEGIN SSH-RSA PUBLIC KEY-----" 9 | cat "${tmpd}/key.pub" 10 | ) & 11 | echo "y" | ssh-keygen -q -N "" -f "${tmpd}/key" 2>/dev/null 1>/dev/null 12 | rm -rf "$tmpd" -------------------------------------------------------------------------------- /scripts/nodemon.service: -------------------------------------------------------------------------------- 1 | 2 | [Unit] 3 | Description=nodemon - Snakepit node resource monitor 4 | 
Documentation=https://github.com/mozilla/snakepit 5 | After=network.target 6 | 7 | [Service] 8 | Type=simple 9 | ExecStart=/bin/bash -c "while true; do { echo -e 'HTTP/1.1 200 OK\r\n'; nvidia-smi --query-gpu=pci.bus_id,temperature.gpu,utilization.gpu,utilization.memory --format=csv; } | nc -N -l 8444 2>&1 >/dev/null; done" 10 | Restart=on-failure 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /scripts/prepare.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | ( 3 | echo "Preparation started..." 4 | 5 | set -ex 6 | set -o pipefail 7 | 8 | mkdir "$JOB_DIR/tmp" 9 | if [ -n "$CONTINUE_JOB_NUMBER" ]; then 10 | cp -r "$DATA_ROOT/pits/$CONTINUE_JOB_NUMBER/keep" "$JOB_DIR/keep" 11 | else 12 | mkdir "$JOB_DIR/keep" 13 | fi 14 | 15 | job_src_dir="$JOB_DIR/src" 16 | archive="$JOB_DIR/archive.tar.gz" 17 | 18 | if [ -f "$JOB_DIR/origin" ]; then 19 | echo "Git based" 20 | origin=$(<"$JOB_DIR/origin") 21 | git clone $origin "$job_src_dir" 22 | cd "$job_src_dir" 23 | if [ -f "$JOB_DIR/hash" ]; then 24 | hash=$(<"$JOB_DIR/hash") 25 | git reset --hard $hash 26 | fi 27 | git submodule update --recursive 28 | git lfs fetch 29 | git lfs checkout 30 | elif [ -f "$archive" ]; then 31 | echo "Archive based" 32 | mkdir "$job_src_dir" 33 | tar -xf "$archive" -C "$job_src_dir" 34 | else 35 | mkdir "$job_src_dir" 36 | fi 37 | 38 | cd "$job_src_dir" 39 | patch_file="$JOB_DIR/git.patch" 40 | if [ -f "$patch_file" ]; then 41 | cat "$patch_file" | patch -p0 42 | fi 43 | 44 | echo "Preparation done." 45 | ) 2>&1 | ts '[%Y-%m-%d %H:%M:%S] [prepare]' >>"$JOB_DIR/pit.log" 46 | -------------------------------------------------------------------------------- /scripts/scan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "Scanning node for resources..." 
4 | if [ -d /proc/driver/nvidia/gpus ]; then 5 | cuda_index=0 6 | for gpu_dir in /proc/driver/nvidia/gpus/*/ ; do 7 | model=`awk -F: '/Model/{gsub(/^[ \t]+/, "", $2);gsub(/[,]/, " ", $2);print $2}' "${gpu_dir}/information"` 8 | echo "resource:cuda,${cuda_index},${model}" >>"$RESULT_FILE" 9 | ((cuda_index++)) 10 | done 11 | echo "Found ${cuda_index} CUDA device(s)" 12 | fi 13 | echo "Node scanning done" 14 | -------------------------------------------------------------------------------- /scripts/setup-service.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | lxd_endpoint=$1 5 | mount_root=$2 6 | uid=$3 7 | 8 | command="sudo lxc exec snakepit -- /code/scripts/setup-service.sh $lxd_endpoint \"$mount_root\" $uid" 9 | if ! curl -k -s $lxd_endpoint/1.0 > /dev/null 2>&1; then 10 | echo "Problem accessing \"$lxd_endpoint\"." 11 | echo "Please call" 12 | echo $command 13 | echo "with the appropriate LXD REST service endpoint." 14 | exit 1 15 | fi 16 | 17 | config_dir="/etc/snakepit" 18 | mkdir -p "$config_dir" 19 | 20 | openssl req -new -newkey rsa:4096 -days 3650 -nodes -x509 \ 21 | -subj "/C=US/ST=California/L=San Francisco/O=Mozilla/CN=mozilla.com" \ 22 | -keyout "$config_dir/lxd.key" \ 23 | -out "$config_dir/lxd.crt" \ 24 | > /dev/null 2>&1 25 | echo -n "Local LXD trust password: " 26 | read -s password 27 | echo "" 28 | json=$(curl \ 29 | -s \ 30 | -k \ 31 | --cert "$config_dir/lxd.crt" \ 32 | --key "$config_dir/lxd.key" \ 33 | $lxd_endpoint/1.0/certificates \ 34 | -X POST \ 35 | -d "{\"type\": \"client\", \"password\": \"$password\"}" \ 36 | ) 37 | if [[ "`echo $json | jq '.status_code'`" -ne "200" ]]; then 38 | echo "Problem authenticating at \"$lxd_endpoint\". Please call" 39 | echo $command 40 | echo "again and provide the correct password." 41 | exit 2 42 | fi 43 | 44 | echo "Successfully authenticated snakepit service at local LXD endpoint." 
45 | 46 | token_secret_path="$config_dir/token-secret.txt" 47 | touch "$token_secret_path" 48 | chmod 600 "$token_secret_path" 49 | openssl rand -base64 32 >"$token_secret_path" 50 | 51 | conf="$config_dir/snakepit.conf" 52 | touch "$conf" 53 | chmod 644 "$conf" 54 | echo -e "# LXD REST URL.\nendpoint: \"$lxd_endpoint\"\n" >>$conf 55 | echo -e "# LXD REST client certificate file.\nclientCert: \"$config_dir/lxd.crt\"\n" >>$conf 56 | echo -e "# LXD REST client key file.\nclientKey: \"$config_dir/lxd.key\"\n" >>$conf 57 | echo -e "# External path of LXD data drive (required for mounting).\nmountRoot: \"$mount_root\"\n" >>$conf 58 | echo -e "# UID of external user that should be mapped to worker/root user (required for write access on mounts).\nmountUid: \"$uid\"\n" >>$conf 59 | echo -e "# Path to session token secret file.\ntokenSecretPath: \"$token_secret_path\"\n" >>$conf 60 | 61 | echo -e "local all all trust\nhost all all 127.0.0.1/32 trust" >/etc/postgresql/10/main/pg_hba.conf 62 | systemctl restart postgresql.service 63 | /code/bin/db-init.sh 64 | 65 | if systemctl is-active --quiet snakepit; then 66 | systemctl stop snakepit 67 | fi 68 | cp /code/scripts/snakepit.service /lib/systemd/system/ 69 | systemctl enable snakepit 70 | reboot 71 | -------------------------------------------------------------------------------- /scripts/snakepit.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=snakepit - Machine learning job scheduler 3 | Documentation=https://github.com/mozilla/snakepit 4 | After=network.target 5 | 6 | [Service] 7 | Environment=NODE_ENV=production 8 | WorkingDirectory=/code 9 | Type=simple 10 | ExecStart=/usr/bin/node /code/src/service.js 11 | Restart=on-failure 12 | 13 | [Install] 14 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /scripts/worker/20auto-upgrades: 
-------------------------------------------------------------------------------- 1 | APT::Periodic::Update-Package-Lists "0"; 2 | APT::Periodic::Download-Upgradeable-Packages "0"; 3 | APT::Periodic::AutocleanInterval "0"; 4 | APT::Periodic::Unattended-Upgrade "0"; -------------------------------------------------------------------------------- /scripts/worker/apt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while fuser /var/lib/dpkg/lock >/dev/null 2>&1 ; do 3 | sleep 1 4 | done 5 | /usr/bin/apt "$@" -------------------------------------------------------------------------------- /scripts/worker/apt-get: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while fuser /var/lib/dpkg/lock >/dev/null 2>&1 ; do 3 | sleep 1 4 | done 5 | /usr/bin/apt-get "$@" -------------------------------------------------------------------------------- /scripts/worker/forwarder/forwarder.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const net = require('net') 3 | const multiplex = require('multiplex') 4 | 5 | const mp = multiplex((stream, id) => { 6 | let port = Number(id.split('-')[1]) 7 | let client = net.createConnection({ port: port }, () => { 8 | stream.pipe(client) 9 | client.pipe(stream) 10 | }) 11 | client.on('error', err => stream.destroy(err)) 12 | }) 13 | 14 | process.stdin.pipe(mp) 15 | mp.pipe(process.stdout) -------------------------------------------------------------------------------- /scripts/worker/forwarder/forwarder.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | node /opt/forwarder/forwarder.js 3 | -------------------------------------------------------------------------------- /scripts/worker/forwarder/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"forwarder", 3 | "version": "0.0.1", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "core-util-is": { 8 | "version": "1.0.2", 9 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 10 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 11 | }, 12 | "duplexify": { 13 | "version": "3.7.1", 14 | "resolved": "https://registry.npmjs.org/duplexify/-/duplexify-3.7.1.tgz", 15 | "integrity": "sha512-07z8uv2wMyS51kKhD1KsdXJg5WQ6t93RneqRxUHnskXVtlYYkLqM0gqStQZ3pj073g687jPCHrqNfCzawLYh5g==", 16 | "requires": { 17 | "end-of-stream": "1.4.1", 18 | "inherits": "2.0.3", 19 | "readable-stream": "2.3.6", 20 | "stream-shift": "1.0.0" 21 | } 22 | }, 23 | "end-of-stream": { 24 | "version": "1.4.1", 25 | "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", 26 | "integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==", 27 | "requires": { 28 | "once": "1.4.0" 29 | } 30 | }, 31 | "inherits": { 32 | "version": "2.0.3", 33 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 34 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 35 | }, 36 | "isarray": { 37 | "version": "1.0.0", 38 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 39 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 40 | }, 41 | "multiplex": { 42 | "version": "6.7.0", 43 | "resolved": "https://registry.npmjs.org/multiplex/-/multiplex-6.7.0.tgz", 44 | "integrity": "sha1-/3Pk5AB5FwxEQtFgllZY+N75YMI=", 45 | "requires": { 46 | "duplexify": "3.7.1", 47 | "inherits": "2.0.3", 48 | "readable-stream": "2.3.6", 49 | "varint": "4.0.1", 50 | "xtend": "4.0.1" 51 | } 52 | }, 53 | "once": { 54 | "version": "1.4.0", 55 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 56 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 57 | "requires": { 58 | "wrappy": "1.0.2" 59 | } 60 | }, 61 | "process-nextick-args": { 62 | "version": "2.0.0", 63 | 
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", 64 | "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" 65 | }, 66 | "readable-stream": { 67 | "version": "2.3.6", 68 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", 69 | "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", 70 | "requires": { 71 | "core-util-is": "1.0.2", 72 | "inherits": "2.0.3", 73 | "isarray": "1.0.0", 74 | "process-nextick-args": "2.0.0", 75 | "safe-buffer": "5.1.2", 76 | "string_decoder": "1.1.1", 77 | "util-deprecate": "1.0.2" 78 | } 79 | }, 80 | "safe-buffer": { 81 | "version": "5.1.2", 82 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", 83 | "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" 84 | }, 85 | "stream-shift": { 86 | "version": "1.0.0", 87 | "resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.0.tgz", 88 | "integrity": "sha1-1cdSgl5TZ+eG944Y5EXqIjoVWVI=" 89 | }, 90 | "string_decoder": { 91 | "version": "1.1.1", 92 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", 93 | "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", 94 | "requires": { 95 | "safe-buffer": "5.1.2" 96 | } 97 | }, 98 | "util-deprecate": { 99 | "version": "1.0.2", 100 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 101 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 102 | }, 103 | "varint": { 104 | "version": "4.0.1", 105 | "resolved": "https://registry.npmjs.org/varint/-/varint-4.0.1.tgz", 106 | "integrity": "sha1-SQgpuULSSEY7KzUJeZXDv3NxmOk=" 107 | }, 108 | "wrappy": { 109 | "version": "1.0.2", 110 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 111 
| "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" 112 | }, 113 | "xtend": { 114 | "version": "4.0.1", 115 | "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.1.tgz", 116 | "integrity": "sha1-pcbVMr5lbiPbgg77lDofBJmNY68=" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /scripts/worker/forwarder/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "forwarder", 3 | "version": "0.0.1", 4 | "description": "Snakepit socket forwarding helper", 5 | "main": "forwarder.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "Tilman Kamp", 10 | "license": "MPL-2.0", 11 | "dependencies": { 12 | "multiplex": "^6.7.0" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /scripts/worker/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$HOSTNAME" = snakepit-worker ]; then 4 | exit 0 5 | fi 6 | 7 | while [[ ! -f "/env.sh" ]]; do 8 | sleep 0.1 9 | done 10 | 11 | export DEBIAN_FRONTEND=noninteractive 12 | source "/etc/profile" 13 | source "/env.sh" 14 | 15 | mkdir /data 16 | worker_dir="/data/rw/pit/workers/${WORKER_INDEX}" 17 | 18 | i=0 19 | while [[ ! -d "${worker_dir}" ]]; do 20 | if [[ ${i} -gt 5 ]]; then 21 | reboot 22 | fi 23 | let i=i+1 24 | sleep 1 25 | done 26 | 27 | cd "${WORK_DIR}" 28 | export RESULT_FILE="${worker_dir}/result" 29 | 30 | log_file="${worker_dir}/worker.log" 31 | pipe_log () { 32 | stdbuf -oL awk '{print "[worker '${WORKER_INDEX}'] " $0}' >>"${log_file}" 33 | } 34 | print_log () { 35 | echo "$1" | pipe_log 36 | } 37 | 38 | print_log "Worker ${WORKER_INDEX} started" 39 | print_log "Preparing script execution..." 40 | apt-get update 2>&1 | pipe_log 41 | systemctl stop apt-daily.service 42 | systemctl kill --kill-who=all apt-daily.service 43 | while ! 
(systemctl list-units --all apt-daily.service | grep -qE 'dead|failed') ; do sleep 1; done 44 | sleep 10 45 | print_log "Starting script..." 46 | stdbuf -oL bash "/data/rw/pit/script.sh" 2>&1 | pipe_log 47 | exit_code=${PIPESTATUS[0]} 48 | echo "$exit_code" >"${worker_dir}/status" 49 | print_log "Worker ${WORKER_INDEX} ended with exit code ${exit_code}" 50 | touch "${worker_dir}/stop" 51 | poweroff 52 | -------------------------------------------------------------------------------- /scripts/worker/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | aptget() { 5 | DEBIAN_FRONTEND=noninteractive apt-get -yq "$@" 6 | } 7 | 8 | print_header () { 9 | printf "\n>>>>>>>> $1 <<<<<<<<\n\n" 10 | } 11 | 12 | print_header "Installing dependencies" 13 | # # Official Nvidia Ubuntu PPA 14 | # aptget update 15 | # aptget install software-properties-common 16 | # # !This image and your GPU nodes should feature the very same driver! 
17 | # add-apt-repository -y ppa:graphics-drivers/ppa 18 | aptget update 19 | aptget install dhcpcd5 sshfs vim iputils-ping npm nfs-common 20 | # aptget install nvidia-driver-410 nvidia-utils-410 nvidia-cuda-toolkit 21 | 22 | print_header "Preparing apt" 23 | mv 20auto-upgrades /etc/apt/apt.conf.d/20auto-upgrades 24 | mv apt /usr/local/sbin/apt 25 | mv apt-get /usr/local/sbin/apt-get 26 | mv forwarder/forwarder.sh /usr/bin/forwarder.sh 27 | mv forwarder /opt/forwarder 28 | pushd /opt/forwarder 29 | npm install 30 | popd 31 | 32 | print_header "Disabling cloud configuration" 33 | echo 'network: {config: disabled}' >/etc/cloud/cloud.cfg.d/99-disable-network-config.cfg 34 | 35 | print_header "Preparing SSH" 36 | mkdir -p /root/.ssh 37 | systemctl disable sshd 38 | 39 | print_header "Installing worker service" 40 | mv run.sh /usr/bin/run.sh 41 | mv snakepit.service /lib/systemd/system/ 42 | systemctl enable snakepit.service 43 | -------------------------------------------------------------------------------- /scripts/worker/snakepit.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=snakepit-worker - Machine learning job scheduler worker service 3 | Documentation=https://github.com/mozilla/snakepit 4 | After=network.target 5 | 6 | [Service] 7 | User=root 8 | WorkingDirectory=/root 9 | Type=simple 10 | ExecStart=/usr/bin/run.sh 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /src/clusterParser.pegjs: -------------------------------------------------------------------------------- 1 | start 2 | = cluster 3 | 4 | cluster 5 | = left:processGroup "," right:cluster { return [left].concat(right); } 6 | / solo:processGroup { return [solo]; } 7 | 8 | processGroup 9 | = left:integer ":" right:process { return { count: left, process: right } } 10 | / solo:process { return { count: 1, process: solo } } 11 | 12 | process 13 | = "[" 
solo:resourceList "]" { return solo } 14 | / "[" "]" { return [] } 15 | 16 | resourceList 17 | = left:resourceGroup "," right:resourceList { return [left].concat(right) } 18 | / solo:resourceGroup { return [solo] } 19 | 20 | resourceGroup 21 | = left:integer ":" right:resource { return { count: left, name: right } } 22 | / solo:resource { return { count: 1, name: solo } } 23 | 24 | resource 25 | = chars:[a-zA-Z]+ alpha:[a-zA-Z0-9]* { return chars.join("") + alpha.join("") } 26 | 27 | integer 28 | = digits:[0-9]+ { return parseInt(digits.join(""), 10) } 29 | -------------------------------------------------------------------------------- /src/config.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | const yaml = require('js-yaml') 4 | const parseDuration = require('parse-duration') 5 | 6 | const oneSecond = 1000 7 | const oneMinute = 60 * oneSecond 8 | const oneHour = 60 * oneMinute 9 | const oneDay = 24 * oneHour 10 | 11 | var filename = process.env.SNAKEPIT_CONF || '/etc/snakepit/snakepit.conf' 12 | if (!fs.existsSync(filename)) { 13 | if (process.env.HOME) { 14 | filename = path.join(process.env.HOME, '.snakepit', 'snakepit.conf') 15 | } else { 16 | filename = path.join('config', 'snakepit.conf') 17 | } 18 | } 19 | 20 | function tryConfigFile(fun, verb) { 21 | try { 22 | return fun() 23 | } catch (err) { 24 | console.error('Problem ' + verb + ' config file "' + filename + '"') 25 | process.exit(1) 26 | } 27 | } 28 | 29 | var content = tryConfigFile(() => fs.readFileSync(filename), 'reading') 30 | var config = module.exports = tryConfigFile(() => yaml.safeLoad(content), 'parsing') 31 | 32 | function readConfigFile(name, mandatory) { 33 | if (fs.existsSync(config[name])) { 34 | return fs.readFileSync(config[name]).toString() 35 | } else if (mandatory) { 36 | throw new Error('Unable to read mandatory config file: ' + name) 37 | } 38 | } 39 | 40 | config.interface = 
process.env.SNAKEPIT_INTERFACE || config.interface || '0.0.0.0' 41 | config.port = process.env.SNAKEPIT_PORT || config.port || 80 42 | 43 | config.logLevel = typeof config.logLevel === 'undefined' ? 1 : Number(config.logLevel) 44 | config.debugHttp = process.env.SNAKEPIT_DEBUG_HTTP || config.debugHttp 45 | config.debugJobFS = process.env.SNAKEPIT_DEBUG_JOBFS || config.debugJobFS 46 | 47 | config.tokenSecret = readConfigFile('tokenSecretPath', true) 48 | config.tokenTTL = parseDuration(config.tokenTTL || '1d') 49 | config.hashRounds = config.hashRounds || 10 50 | 51 | if (!config.endpoint) { 52 | throw new Error('Missing field: endpoint') 53 | } 54 | config.clientKey = readConfigFile('clientKey', true) 55 | config.clientCert = readConfigFile('clientCert', true) 56 | 57 | config.pollInterval = config.pollInterval ? Number(config.pollInterval) : oneSecond 58 | config.maxParallelPrep = config.maxParallelPrep ? Number(config.maxParallelPrep) : 2 59 | config.maxPrepDuration = parseDuration(config.maxPrepDuration || '1h') 60 | config.maxStartDuration = parseDuration(config.maxStartDuration || '5m') 61 | config.containerTimeout = parseDuration(config.containerTimeout || '20s') 62 | config.lxdTimeout = parseDuration(config.lxdTimeout || '5s') 63 | config.lxdBridge = config.lxdBridge || 'lxdbr0' 64 | config.lxdDomain = config.lxdDomain || 'lxd' 65 | 66 | config.mountRoot = config.mountRoot || '/snakepit' 67 | 68 | config.queryLimit = config.queryLimit || 100 69 | -------------------------------------------------------------------------------- /src/models/Alias-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | 4 | var Alias = sequelize.define('alias', { 5 | id: { type: Sequelize.STRING, allowNull: false, primaryKey: true }, 6 | name: { type: Sequelize.STRING, allowNull: false, unique: true } 7 | }) 8 | 9 | Alias.getAlias = async name => { 10 | let 
entry = await Alias.findOne({ where: { name: name } }) 11 | return entry && entry.id 12 | } 13 | 14 | module.exports = Alias 15 | -------------------------------------------------------------------------------- /src/models/Allocation-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | 4 | var Allocation = sequelize.define('allocation', { 5 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true }, 6 | cmemory: { type: Sequelize.DOUBLE, allowNull: true }, 7 | amemory: { type: Sequelize.DOUBLE, allowNull: true }, 8 | ccompute: { type: Sequelize.DOUBLE, allowNull: true }, 9 | acompute: { type: Sequelize.DOUBLE, allowNull: true }, 10 | samples: { type: Sequelize.INTEGER, defaultValue: 0 } 11 | }) 12 | 13 | module.exports = Allocation 14 | -------------------------------------------------------------------------------- /src/models/Group-model.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs-extra') 2 | const { v4: uuidv4 } = require('uuid') 3 | const path = require('path') 4 | const Sequelize = require('sequelize') 5 | const sequelize = require('./db.js') 6 | const config = require('../config.js') 7 | const log = require('../utils/logger.js') 8 | 9 | var Group = sequelize.define('group', { 10 | id: { type: Sequelize.STRING, allowNull: false, primaryKey: true }, 11 | title: { type: Sequelize.STRING, allowNull: true } 12 | }) 13 | 14 | const groupPrefix = '/data/groups/' 15 | 16 | Group.afterCreate(async group => { 17 | let groupDir = groupPrefix + group.id 18 | if (!(await fs.pathExists(groupDir))) { 19 | await fs.mkdirp(groupDir) 20 | } 21 | }) 22 | 23 | Group.afterDestroy(async group => { 24 | let groupDir = groupPrefix + group.id 25 | if (await fs.pathExists(groupDir)) { 26 | await fs.move(groupDir, '/data/trash/' + uuidv4()) 27 | } 28 | }) 29 | 30 | Group.getDir = 
(groupId) => groupPrefix + groupId 31 | Group.prototype.getDir = function () { 32 | return Group.getDir(this.id) 33 | } 34 | 35 | Group.getDirExternal = (groupId) => path.join(config.mountRoot, 'groups', groupId + '') 36 | Group.prototype.getDirExternal = function () { 37 | return Group.getDirExternal(this.id) 38 | } 39 | 40 | module.exports = Group 41 | -------------------------------------------------------------------------------- /src/models/Job-model.js: -------------------------------------------------------------------------------- 1 | const assign = require('assign-deep') 2 | const Sequelize = require('sequelize') 3 | const sequelize = require('./db.js') 4 | const Pit = require('./Pit-model.js') 5 | const Group = require('./Group-model.js') 6 | const User = require('./User-model.js') 7 | const State = require('./State-model.js') 8 | const ProcessGroup = require('./ProcessGroup-model.js') 9 | const Process = require('./Process-model.js') 10 | const Allocation = require('./Allocation-model.js') 11 | 12 | const log = require('../utils/logger.js') 13 | 14 | var Job = sequelize.define('job', { 15 | id: { type: Sequelize.INTEGER, primaryKey: true }, 16 | description: { type: Sequelize.STRING, allowNull: false }, 17 | provisioning: { type: Sequelize.STRING, allowNull: false }, 18 | request: { type: Sequelize.STRING, allowNull: false }, 19 | image: { type: Sequelize.STRING, allowNull: true }, 20 | state: { type: Sequelize.INTEGER, allowNull: true }, 21 | since: { type: Sequelize.DATE, allowNull: true }, 22 | rank: { type: Sequelize.INTEGER, allowNull: false, defaultValue: 0 }, 23 | allocation: { type: Sequelize.STRING, allowNull: true }, 24 | continues: { type: Sequelize.INTEGER, allowNull: true } 25 | }) 26 | 27 | Job.jobStates = { 28 | NEW: 0, 29 | PREPARING: 1, 30 | WAITING: 2, 31 | STARTING: 3, 32 | RUNNING: 4, 33 | STOPPING: 5, 34 | CLEANING: 6, 35 | DONE: 7 36 | } 37 | 38 | Job.hasMany(State, { onDelete: 'cascade' }) 39 | State.belongsTo(Job) 40 | 41 | 
Job.hasMany(ProcessGroup, { onDelete: 'cascade' }) 42 | ProcessGroup.belongsTo(Job) 43 | 44 | Job.belongsTo(Pit, { foreignKey: 'id', onDelete: 'cascade' }) 45 | 46 | Job.belongsTo(User, { constraints: false }) 47 | 48 | var JobGroup = Job.JobGroup = sequelize.define('jobgroup', { 49 | jobId: { type: Sequelize.INTEGER, unique: 'pk' }, 50 | groupId: { type: Sequelize.STRING, unique: 'pk' } 51 | }) 52 | Job.hasMany(JobGroup, { onDelete: 'cascade' }) 53 | Group.hasMany(JobGroup, { onDelete: 'cascade' }) 54 | JobGroup.belongsTo(Job) 55 | JobGroup.belongsTo(Group) 56 | 57 | User.prototype.canAccessJob = async function (job) { 58 | if (this.admin || job.userId == this.id) { 59 | return true 60 | } 61 | return await Job.findOne({ 62 | where: { id: job.id, '$jobgroups->group->usergroups.userId$': this.id }, 63 | include: [ 64 | { 65 | model: JobGroup, 66 | require: true, 67 | include: [ 68 | { 69 | model: Group, 70 | require: true, 71 | include: [ 72 | { 73 | model: User.UserGroup, 74 | require: true 75 | } 76 | ] 77 | } 78 | ] 79 | } 80 | ] 81 | }) 82 | } 83 | 84 | Job.getDir = (jobId) => Pit.getDir(jobId) 85 | Job.prototype.getDir = function () { 86 | return Pit.getDir(this.id) 87 | } 88 | 89 | Job.getDirExternal = (jobId) => Pit.getDirExternal(jobId) 90 | Job.prototype.getDirExternal = function () { 91 | return Pit.getDirExternal(this.id) 92 | } 93 | 94 | Job.prototype.setState = async function (state, reason) { 95 | if (this.state == state) { 96 | return 97 | } 98 | let t 99 | try { 100 | t = await sequelize.transaction({ type: Sequelize.Transaction.TYPES.EXCLUSIVE }) 101 | if (this.state != Job.jobStates.WAITING && state == Job.jobStates.WAITING) { 102 | this.rank = ((await Job.max('rank', { where: { state: Job.jobStates.WAITING } })) || 0) + 1 103 | } else if (this.state == Job.jobStates.WAITING && state != Job.jobStates.WAITING) { 104 | await Job.update( 105 | { rank: Sequelize.literal('rank - 1') }, 106 | { 107 | where: { 108 | state: Job.jobStates.WAITING, 109 | 
rank: { [Sequelize.Op.gt]: this.rank } 110 | }, 111 | transaction: t, 112 | lock: t.LOCK 113 | } 114 | ) 115 | this.rank = 0 116 | } 117 | this.state = state 118 | this.since = Date.now() 119 | await this.save({ transaction: t, lock: t.LOCK }) 120 | await State.create({ jobId: this.id, state: state, since: Date.now(), reason: reason }) 121 | await t.commit() 122 | } catch (err) { 123 | await t.rollback() 124 | throw err 125 | } 126 | } 127 | 128 | Job.infoQuery = options => assign({ 129 | subQuery: false, 130 | include: [ 131 | { 132 | model: ProcessGroup, 133 | require: false, 134 | attributes: [], 135 | include: [ 136 | { 137 | model: Process, 138 | require: false, 139 | attributes: [], 140 | include: 141 | [ 142 | { 143 | model: Allocation, 144 | require: false, 145 | attributes: [] 146 | } 147 | ] 148 | } 149 | ] 150 | } 151 | ], 152 | group: [ 153 | 'job.id' 154 | ], 155 | attributes: { 156 | include: [ 157 | [sequelize.fn('sum', sequelize.col('processgroups->processes->allocations.samples')), 'samples'], 158 | [sequelize.fn('sum', sequelize.col('processgroups->processes->allocations.acompute')), 'aggcompute'], 159 | [sequelize.fn('sum', sequelize.col('processgroups->processes->allocations.amemory')), 'aggmemory'], 160 | [sequelize.fn('avg', sequelize.col('processgroups->processes->allocations.ccompute')), 'curcompute'], 161 | [sequelize.fn('avg', sequelize.col('processgroups->processes->allocations.cmemory')), 'curmemory'] 162 | ] 163 | } 164 | }, options || {}) 165 | 166 | Allocation.activeQuery = { 167 | include: [ 168 | { 169 | model: Process, 170 | require: true, 171 | attributes: [], 172 | include: [ 173 | { 174 | model: ProcessGroup, 175 | require: true, 176 | attributes: [], 177 | include: [ 178 | { 179 | model: Job, 180 | require: true, 181 | attributes: [] 182 | } 183 | ] 184 | } 185 | ] 186 | } 187 | ], 188 | where: { 189 | '$process->processgroup->job.state$': { 190 | [Sequelize.Op.gte]: Job.jobStates.STARTING, 191 | [Sequelize.Op.lte]: 
Job.jobStates.STOPPING 192 | } 193 | } 194 | } 195 | 196 | module.exports = Job 197 | -------------------------------------------------------------------------------- /src/models/Node-model.js: -------------------------------------------------------------------------------- 1 | const Parallel = require('async-parallel') 2 | const Sequelize = require('sequelize') 3 | const sequelize = require('./db.js') 4 | const Resource = require('./Resource-model.js') 5 | const lxd = require('../utils/lxd.js') 6 | const config = require('../config.js') 7 | 8 | var Node = sequelize.define('node', { 9 | id: { type: Sequelize.STRING, primaryKey: true }, 10 | endpoint: { type: Sequelize.STRING, allowNull: false }, 11 | password: { type: Sequelize.STRING, allowNull: true }, 12 | online: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }, 13 | available: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }, 14 | since: { type: Sequelize.DATE, allowNull: false } 15 | }) 16 | 17 | Node.hasMany(Resource, { onDelete: 'cascade' }) 18 | Resource.belongsTo(Node) 19 | 20 | Node.beforeCreate(async node => { 21 | if (!(await Node.findOne({ where: { endpoint: node.endpoint } }))) { 22 | try { 23 | await lxd.post(node.endpoint, 'certificates', { type: 'client', password: node.password }) 24 | } catch (ex) { 25 | if (!ex.response || !ex.response.data || !ex.response.data.error || 26 | ex.response.data.error != 'Certificate already in trust store') { 27 | throw ex 28 | } 29 | } 30 | } 31 | delete node.password 32 | }) 33 | 34 | Node.afterDestroy(async node => { 35 | if (node.endpoint != config.endpoint) { 36 | let certificates = await lxd.get(node.endpoint, 'certificates') 37 | certificates = certificates.map(c => { 38 | c = c.split('/') 39 | return c[c.length - 1] 40 | }) 41 | await Parallel.each(certificates, async c => { 42 | let cpath = 'certificates/' + c 43 | let cinfo = await lxd.get(node.endpoint, cpath) 44 | if (cinfo.certificate == config.lxdCert) { 45 | await 
lxd.delete(node.endpoint, cpath) 46 | } 47 | }) 48 | } 49 | }) 50 | 51 | module.exports = Node 52 | -------------------------------------------------------------------------------- /src/models/Pit-model.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const Sequelize = require('sequelize') 3 | const sequelize = require('./db.js') 4 | const config = require('../config.js') 5 | const { v4: uuidv4 } = require('uuid') 6 | 7 | const fs = require('fs-extra') 8 | 9 | var Pit = sequelize.define('pit', { 10 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true } 11 | }) 12 | 13 | const pitPrefix = '/data/pits/' 14 | 15 | Pit.afterCreate(async pit => { 16 | let pitDir = pitPrefix + pit.id 17 | if (!(await fs.pathExists(pitDir))) { 18 | await fs.mkdirp(pitDir) 19 | } 20 | }) 21 | 22 | Pit.afterDestroy(async pit => { 23 | let pitDir = pitPrefix + pit.id 24 | if (await fs.pathExists(pitDir)) { 25 | await fs.move(pitDir, '/data/trash/' + uuidv4()) 26 | } 27 | }) 28 | 29 | Pit.getDir = (pitId) => pitPrefix + pitId 30 | Pit.prototype.getDir = function () { 31 | return Pit.getDir(this.id) 32 | } 33 | 34 | Pit.getDirExternal = (pitId) => path.join(config.mountRoot, 'pits', pitId + '') 35 | Pit.prototype.getDirExternal = function () { 36 | return Pit.getDirExternal(this.id) 37 | } 38 | 39 | module.exports = Pit 40 | -------------------------------------------------------------------------------- /src/models/Process-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | const Allocation = require('./Allocation-model.js') 4 | const Node = require('./Node-model.js') 5 | 6 | var Process = sequelize.define('process', { 7 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true }, 8 | index: { type: Sequelize.INTEGER, allowNull: false }, 9 | status: { type: 
Sequelize.INTEGER, allowNull: true }, 10 | result: { type: Sequelize.STRING, allowNull: true } 11 | }) 12 | 13 | Process.hasMany(Allocation, { onDelete: 'cascade' }) 14 | Allocation.belongsTo(Process) 15 | 16 | Node.hasMany(Process, { onDelete: 'no action', constraints: false }) 17 | Process.belongsTo(Node, { constraints: false }) 18 | 19 | module.exports = Process 20 | -------------------------------------------------------------------------------- /src/models/ProcessGroup-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | const Process = require('./Process-model.js') 4 | 5 | var ProcessGroup = sequelize.define('processgroup', { 6 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true }, 7 | index: { type: Sequelize.INTEGER, allowNull: false } 8 | }) 9 | 10 | ProcessGroup.hasMany(Process, { onDelete: 'cascade' }) 11 | Process.belongsTo(ProcessGroup) 12 | 13 | module.exports = ProcessGroup 14 | -------------------------------------------------------------------------------- /src/models/Resource-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | const Alias = require('./Alias-model.js') 4 | const Allocation = require('./Allocation-model.js') 5 | const Group = require('./Group-model.js') 6 | const User = require('./User-model.js') 7 | 8 | var Resource = sequelize.define('resource', { 9 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true }, 10 | type: { type: Sequelize.STRING, allowNull: false }, 11 | index: { type: Sequelize.INTEGER, allowNull: false }, 12 | name: { type: Sequelize.STRING, allowNull: false } 13 | }) 14 | 15 | Resource.hasMany(Allocation) 16 | Allocation.belongsTo(Resource) 17 | 18 | Resource.belongsTo(Alias, { constraints: false, foreignKey: 'name', targetKey: 
'name' }) 19 | //Alias.belongsTo(Resource, { foreignKey: 'name', targetKey: 'name' }) 20 | 21 | var ResourceGroup = Resource.ResourceGroup = sequelize.define('resourcegroup', { 22 | resourceId: { type: Sequelize.INTEGER, unique: 'pk' }, 23 | groupId: { type: Sequelize.STRING, unique: 'pk' } 24 | }) 25 | Resource.hasMany(ResourceGroup, { onDelete: 'cascade' }) 26 | Group.hasMany(ResourceGroup, { onDelete: 'cascade' }) 27 | ResourceGroup.belongsTo(Resource) 28 | ResourceGroup.belongsTo(Group) 29 | 30 | User.prototype.canAccessResource = async function (resource) { 31 | if (this.admin) { 32 | return true 33 | } 34 | return await Resource.findOne({ 35 | where: { id: resource.id, '$resourcegroups->group->usergroups.userId$': this.id }, 36 | include: [ 37 | { 38 | model: ResourceGroup, 39 | require: true, 40 | include: [ 41 | { 42 | model: Group, 43 | require: true, 44 | include: [ 45 | { 46 | model: User.UserGroup, 47 | require: true 48 | } 49 | ] 50 | } 51 | ] 52 | } 53 | ] 54 | }) 55 | } 56 | 57 | Resource.prototype.addUtilization = async function (compute, memory) { 58 | let allocations = await this.getAllocations(Allocation.activeQuery) 59 | if (allocations && allocations.length > 0) { 60 | let allocation = allocations[0] 61 | allocation.cmemory = memory 62 | allocation.amemory += memory 63 | allocation.ccompute = compute 64 | allocation.acompute += compute 65 | allocation.samples++ 66 | await allocation.save() 67 | } 68 | } 69 | 70 | module.exports = Resource 71 | -------------------------------------------------------------------------------- /src/models/State-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | 4 | var State = sequelize.define('state', { 5 | id: { type: Sequelize.INTEGER, autoIncrement: true, primaryKey: true }, 6 | state: { type: Sequelize.INTEGER, allowNull: false }, 7 | since: { type: Sequelize.DATE, allowNull: false }, 
8 | reason: { type: Sequelize.STRING, allowNull: true } 9 | }) 10 | 11 | module.exports = State 12 | -------------------------------------------------------------------------------- /src/models/User-model.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | const sequelize = require('./db.js') 3 | const config = require('../config.js') 4 | const Group = require('./Group-model.js') 5 | 6 | const fs = require('fs-extra') 7 | const { v4: uuidv4 } = require('uuid') 8 | const path = require('path') 9 | 10 | var User = sequelize.define('user', { 11 | id: { type: Sequelize.STRING, allowNull: false, primaryKey: true }, 12 | password: { type: Sequelize.STRING, allowNull: false }, 13 | admin: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }, 14 | fullname: { type: Sequelize.STRING, allowNull: true }, 15 | email: { type: Sequelize.STRING, allowNull: true } 16 | }) 17 | 18 | var UserGroup = User.UserGroup = sequelize.define('usergroup', { 19 | userId: { type: Sequelize.STRING, unique: 'pk' }, 20 | groupId: { type: Sequelize.STRING, unique: 'pk' } 21 | }) 22 | User.hasMany(UserGroup, { onDelete: 'cascade' }) 23 | Group.hasMany(UserGroup, { onDelete: 'cascade' }) 24 | UserGroup.belongsTo(User) 25 | UserGroup.belongsTo(Group) 26 | 27 | var AutoShare = User.AutoShare = sequelize.define('autoshare', { 28 | userId: { type: Sequelize.STRING, unique: 'pk' }, 29 | groupId: { type: Sequelize.STRING, unique: 'pk' } 30 | }) 31 | User.hasMany(AutoShare, { onDelete: 'cascade' }) 32 | Group.hasMany(AutoShare, { onDelete: 'cascade' }) 33 | AutoShare.belongsTo(User) 34 | AutoShare.belongsTo(Group) 35 | 36 | User.prototype.setAutoShares = async function (autoShares) { 37 | let autoShareGroups = await User.AutoShare.findAll({ where: { userId: this.id } }) 38 | let autoShareIds = autoShares.reduce((map, asId) => { map[asId] = true; return map }, {}) 39 | for (let asg of autoShareGroups) { 40 | if 
(!(asg.id in autoShareIds)) { 41 | await asg.destroy() 42 | } 43 | } 44 | autoShareIds = autoShareGroups.reduce((map, asg) => { map[asg.id] = true; return map }, {}) 45 | for(let asgId of autoShares) { 46 | if (!(asgId in autoShareIds)) { 47 | await User.AutoShare.create({ userId: this.id, groupId: asgId }) 48 | } 49 | } 50 | } 51 | 52 | const userPrefix = '/data/home/' 53 | 54 | User.prototype.isMemberOf = async function (group) { 55 | return group && await User.UserGroup.findOne({ where: { userId: this.id, groupId: group.id } }) 56 | } 57 | 58 | User.afterCreate(async user => { 59 | let userDir = userPrefix + user.id 60 | if (!(await fs.pathExists(userDir))) { 61 | await fs.mkdirp(userDir) 62 | } 63 | }) 64 | 65 | User.afterDestroy(async user => { 66 | let userDir = userPrefix + user.id 67 | if (await fs.pathExists(userDir)) { 68 | await fs.move(userDir, '/data/trash/' + uuidv4()) 69 | } 70 | }) 71 | 72 | User.getDir = (userId) => userPrefix + userId 73 | User.prototype.getDir = function () { 74 | return User.getDir(this.id) 75 | } 76 | 77 | User.getDirExternal = (userId) => path.join(config.mountRoot, 'home', userId + '') 78 | User.prototype.getDirExternal = function () { 79 | return User.getDirExternal(this.id) 80 | } 81 | 82 | module.exports = User 83 | -------------------------------------------------------------------------------- /src/models/db.js: -------------------------------------------------------------------------------- 1 | const Sequelize = require('sequelize') 2 | 3 | const config = require('../config.js') 4 | 5 | module.exports = new Sequelize('snakepit', 'postgres', '', { 6 | host: 'localhost', 7 | dialect: 'postgres', 8 | pool: { max: 10, min: 0, acquire: 30000, idle: 10000 }, 9 | logging: false 10 | }) 11 | -------------------------------------------------------------------------------- /src/models/index.js: -------------------------------------------------------------------------------- 1 | const glob = require('glob') 2 | const path = 
require('path') 3 | 4 | var exports = module.exports = { all: [] } 5 | 6 | glob.sync(__dirname + '/*-model.js').forEach(moduleName => { 7 | let modelName = path.basename(moduleName) 8 | modelName = modelName.substr(0, modelName.lastIndexOf('-')) 9 | let model = require(path.resolve(moduleName)) 10 | exports[modelName] = model 11 | exports.all.push(model) 12 | }) 13 | exports.sequelize = require('./db.js') -------------------------------------------------------------------------------- /src/pitRunner.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs-extra') 2 | const url = require('url') 3 | const path = require('path') 4 | const axios = require('axios') 5 | const assign = require('assign-deep') 6 | const Parallel = require('async-parallel') 7 | 8 | const lxd = require('./utils/lxd.js') 9 | const log = require('./utils/logger.js') 10 | const { to } = require('./utils/async.js') 11 | const { runScript } = require('./utils/scripts.js') 12 | const { envToScript } = require('./utils/scripts.js') 13 | const clusterEvents = require('./utils/clusterEvents.js') 14 | const Pit = require('./models/Pit-model.js') 15 | const Node = require('./models/Node-model.js') 16 | const config = require('./config.js') 17 | 18 | const snakepitPrefix = 'sp' 19 | const containerNameParser = /sp-([a-z][a-z0-9]*)-([0-9]+)-(d|0|[1-9][0-9]*)/ 20 | const utilParser = /[^,]+, ([0-9]+), ([0-9]+) \%, ([0-9]+) \%/ 21 | 22 | const headNode = Node.build({ 23 | id: 'head', 24 | endpoint: config.endpoint 25 | }) 26 | 27 | let currentContainers = {} 28 | async function getContainerNode (pitId, instance) { 29 | let pitContainers = currentContainers[pitId] 30 | let nodeId = pitContainers && pitContainers.find(c => c[2] == instance) 31 | return nodeId && nodeId[0] && await getNodeById(nodeId[0]) 32 | } 33 | 34 | async function getAllNodes () { 35 | let nodes = await Node.findAll() 36 | return [headNode, ...nodes] 37 | } 38 | 39 | async function 
getNodeById (nodeId) {
    return nodeId == 'head' ? headNode : await Node.findByPk(nodeId)
}

// sp-<nodeId>-<pitId>-<instance>
function getContainerName (nodeId, pitId, instance) {
    return snakepitPrefix + '-' + nodeId + '-' + pitId + '-' + instance
}

// The pit daemon always runs on the head node with instance id 'd'.
function getDaemonName (pitId) {
    return getContainerName(headNode.id, pitId, 'd')
}

// Returns [nodeId, pitId, instance] or null for foreign container names.
function parseContainerName (containerName) {
    let match = containerNameParser.exec(containerName)
    return match && [match[1], match[2], match[3]]
}

// NOTE(review): parsed is null for non-snakepit names; callers appear to only
// pass names produced by getContainerName — confirm before reuse elsewhere.
async function getNodeFromName (containerName) {
    let parsed = parseContainerName(containerName)
    return await getNodeById(parsed[0])
}

// DNS name of a worker container inside the LXD-managed domain.
function getWorkerHost (nodeId, pitId, index) {
    return getContainerName(nodeId, pitId, index) + '.' + config.lxdDomain
}
exports.getWorkerHost = getWorkerHost

function getNodeInfo (node) {
    return lxd.get(node.endpoint, '')
}

// Cached head-node info; fetched once and kept for the process lifetime.
var headInfo
async function getHeadInfo () {
    if (headInfo) {
        return headInfo
    }
    return headInfo = await getNodeInfo(headNode)
}

async function getHeadCertificate () {
    let info = await getHeadInfo()
    return info.environment && info.environment.certificate
}

// Lists snakepit containers of this node (filters out other nodes' containers
// by the node-id component embedded in the container name).
async function getContainersOnNode (node) {
    let results = await lxd.get(node.endpoint, 'containers')
    let containers = []
    for (let result of results) {
        let split = result.split('/')
        if (split.length > 0) {
            let container = split[split.length - 1]
            let containerInfo = parseContainerName(container)
            if (containerInfo && containerInfo[0] == node.id) {
                containers.push(container)
            }
        }
    }
    return containers
}

// Issues an LXD state change (start/stop/...) for a container.
async function setContainerState (containerName, state, force, stateful) {
    let node = await getNodeFromName(containerName)
    await lxd.put(node.endpoint, 'containers/' + containerName + '/state', {
        action: state,
        timeout: config.containerTimeout,
        force: !!force,
        stateful: !!stateful
    })
}

// Pushes file content into a container via the LXD files API.
// NOTE(review): 'plain/text' looks like a typo for 'text/plain' — confirm
// whether LXD inspects this header before changing it.
async function sendToContainer (containerName, filePath, content, options) {
    let node = await getNodeFromName(containerName)
    await lxd.post(node.endpoint, 'containers/' + containerName + '/files?path=' + filePath, content, assign({
        headers: {
            'Content-Type': 'plain/text',
            'X-LXD-type': 'file'
        }
    }, options || {}))
}

// A pit is asked to stop by creating a 'stop' marker file in its directory.
function pitRequestedStop (pitId) {
    return fs.pathExists(path.join(Pit.getDir(pitId), 'stop'))
}

// Creates (but does not start) a container from an image published on the head node.
async function addContainer (containerName, imageHash, options) {
    let node = await getNodeFromName(containerName)
    let cert = await getHeadCertificate()
    let containerConfig = assign({
        name: containerName,
        architecture: 'x86_64',
        profiles: [],
        ephemeral: false,
        devices: {
            'root': {
                path: '/',
                pool: 'default',
                type: 'disk'
            }
        },
        source: {
            type: 'image',
            mode: 'pull',
            server: config.endpoint,
            protocol: 'lxd',
            certificate: cert,
            fingerprint: imageHash
        },
    }, options || {})
    await lxd.post(node.endpoint, 'containers', containerConfig)
}

// Runs keygen.sh and splits its stdout on the public-key marker line.
// Resolves to [privateKey, publicKey]; rejects with the exit code, or 2 when
// the marker is missing or at an edge of the output.
function generateKeyPair () {
    return new Promise((resolve, reject) => {
        runScript('keygen.sh', {}, async (code, stdout, stderr) => {
            if (code) {
                reject(code)
                return
            }
            let lines = stdout.split('\n')
            let splitter = lines.indexOf('-----BEGIN SSH-RSA PUBLIC KEY-----')
            if (splitter <= 0 || splitter >= lines.length - 1) {
                reject(2)
                return
            }
            resolve([
                lines.slice(0, splitter).join('\n').trim(),
                lines.slice(splitter + 1).join('\n').trim()
            ])
        })
    })
}

// Starts a pit: creates and starts the daemon container on the head node,
// then creates and starts one worker container per entry in `workers`,
// distributing an ad-hoc SSH key pair so workers can reach the daemon.
// Emits pitStarting/pitStarted; on any failure emits pitStartFailed,
// requests a stop and rethrows.
async function startPit (pitId, drives, workers) {
    try {
        clusterEvents.emit('pitStarting', pitId)

        let [key, keyPub] = await generateKeyPair()

        let pitDir = Pit.getDir(pitId)
        let daemonHash = (await lxd.get(headNode.endpoint, 'images/aliases/snakepit-daemon')).target
        let workerHash = (await lxd.get(headNode.endpoint, 'images/aliases/snakepit-worker')).target

        let daemonDevices = {
            'pit': {
                path: '/data/rw/pit',
                source: Pit.getDirExternal(pitId),
                type: 'disk'
            },
            'eth0': {
                type: 'nic',
                nictype: 'bridged',
                parent: config.lxdBridge
            }
        }

        // Optional extra disk mounts requested by the job.
        if (drives) {
            for (let dest of Object.keys(drives)) {
                daemonDevices[dest] = {
                    path: dest,
                    source: drives[dest],
                    type: 'disk'
                }
            }
        }
        let daemonContainerName = getDaemonName(pitId)
        await addContainer(
            daemonContainerName,
            daemonHash,
            {
                devices: daemonDevices,
                config: { 'raw.idmap': 'both ' + config.mountUid + ' 2525' }
            }
        )
        await setContainerState(daemonContainerName, 'start')
        // Authorize the fresh key pair for the daemon's worker account.
        await sendToContainer(
            daemonContainerName,
            '/home/worker/.ssh/authorized_keys',
            keyPub,
            { headers: {
                'X-LXD-mode': '0644',
                'X-LXD-gid': '2525',
                'X-LXD-uid': '2525'
            } }
        )

        // Phase 1: create all worker containers in parallel.
        await Parallel.each(workers, async function createWorker(worker) {
            let index = workers.indexOf(worker)
            let containerName = getContainerName(worker.node.id, pitId, index)
            let workerDir = path.join(pitDir, 'workers', '' + index)
            await fs.mkdirp(workerDir)
            await addContainer(
                containerName,
                workerHash,
                assign({
                    devices: {
                        'pit': {
                            path: '/data/rw/pit',
                            source: '/mnt/snakepit/pits/' + pitId,
                            type: 'disk'
                        },
                        'eth0': {
                            type: 'nic',
                            nictype: 'bridged',
                            parent: config.lxdBridge
                        }
                    },
                    config: { 'raw.idmap': 'both ' + config.mountUid + ' 0' }
                }, worker.options || {})
            )
        })

        // Phase 2: start workers and provision key material plus env script.
        let daemonFQDN = daemonContainerName + '.' + config.lxdDomain
        await Parallel.each(workers, async worker => {
            let workerIndex = workers.indexOf(worker)
            let containerName = getContainerName(worker.node.id, pitId, workerIndex)
            await setContainerState(containerName, 'start')
            await sendToContainer(
                containerName,
                '/root/.ssh/id_rsa',
                key,
                { headers: {
                    'X-LXD-mode': '0600'
                } }
            )
            await sendToContainer(
                containerName,
                '/root/.ssh/id_rsa.pub',
                keyPub,
                { headers: {
                    'X-LXD-mode': '0644'
                } }
            )
            await sendToContainer(containerName, '/env.sh', envToScript(assign({
                DAEMON: daemonFQDN,
                WORKER_INDEX: workerIndex
            }, worker.env), true))
        })
        clusterEvents.emit('pitStarted', pitId)
    } catch (ex) {
        clusterEvents.emit('pitStartFailed', pitId)
        await stopPit(pitId)
        throw ex
    }
}
exports.startPit = startPit

// Resolves when a 'pitStopped' event arrives for this pit; rejects with
// 'timeout' after `timeout` seconds if given.
function waitForPit (pitId, timeout) {
    return new Promise((resolve, reject) => {
        let timer
        let stopListener = (stoppingPitId) => {
            if (stoppingPitId == pitId) {
                if (timer) {
                    clearTimeout(timer)
                }
                clusterEvents.removeListener('pitStopped', stopListener)
                resolve()
            }
        }
        let timeoutListener = () => {
            clusterEvents.removeListener('pitStopped', stopListener)
            reject('timeout')
        }
        clusterEvents.on('pitStopped', stopListener)
        if (timeout) {
            timer = setTimeout(timeoutListener, 1000 * timeout)
        }
    })
}
exports.waitForPit = waitForPit

// Convenience: start a pit and wait for it to finish.
async function runPit (pitId, drives, workers, timeout) {
    await startPit(pitId, drives, workers)
    await waitForPit(pitId, timeout)
}
exports.runPit = runPit

// Opens an interactive exec websocket in the given pit instance.
// Returns undefined when the instance's container/node is unknown.
async function exec (pitId, instance, context) {
    let node = await getContainerNode(pitId, instance)
    if (!node) {
        return
    }
    let containerName = getContainerName(node.id, pitId, instance)
    return await lxd.post(
        node.endpoint,
        'containers/' + containerName + '/exec',
        assign({
            'interactive': true,
            'wait-for-websocket': true,
        }, context),
        { openSocket: true }
    )
}
exports.exec = exec

function getLogPath (pitId) {
    return path.join(Pit.getDir(pitId), 'pit.log')
}
exports.getLogPath = getLogPath

// Reads the pit log; undefined when the file is missing/unreadable.
async function getLog (pitId) {
    try {
        return await fs.readFile(getLogPath(pitId))
    } catch (ex) {
        return undefined
    }
}
exports.getLog = getLog

// Collects per-worker status/result files into a sparse array indexed by
// worker number. Missing files simply leave the field unset.
async function getResults (pitId) {
    let pitDir = Pit.getDir(pitId)
    let workersDir = path.join(pitDir, 'workers')
    let workers = await fs.readdir(workersDir)
    workers = workers.map(w => parseInt(w)).filter(w => !isNaN(w)).sort((a, b) => a - b)
    let results = []
    await Parallel.each(workers, async worker => {
        let result = {}
        let [errStatus, statusContent] = await to(fs.readFile(path.join(workersDir, worker + '', 'status')))
        if (statusContent) {
            result.status = Number(statusContent.toString())
        }
        let [errResult, resultContent] = await to(fs.readFile(path.join(workersDir, worker + '', 'result')))
        if (resultContent) {
            result.result = resultContent.toString()
        }
        results[worker] = result
    })
    return results
}
exports.getResults = getResults

// Requests a pit stop by dropping the 'stop' marker file; the tick loop
// notices it and tears the containers down (best-effort, errors ignored).
async function stopPit (pitId) {
    log.debug('Stopping pit', pitId)
    clusterEvents.emit('pitStopping', pitId)
    await to(fs.ensureFile(path.join(Pit.getDir(pitId), 'stop')))
}
exports.stopPit = stopPit

// Force-stops and deletes every container of the pit on every node,
// then announces 'pitStopped'. Individual failures are ignored.
async function stopContainers (pitId) {
    let nodes = await getAllNodes()
    await Parallel.each(nodes, async node => {
        let [errC, containers] = await to(getContainersOnNode(node))
        if (containers) {
            await Parallel.each(containers, async containerName => {
                let containerInfo = parseContainerName(containerName)
                if (containerInfo && containerInfo[1] == pitId) {
                    let [errStop] = await to(setContainerState(containerName, 'stop', true))
                    let [errDelete] = await to(lxd.delete(node.endpoint, 'containers/' + containerName))
                }
            })
        }
    })
    clusterEvents.emit('pitStopped', pitId)
}

// One poll cycle: refresh node online state, sample utilization from each
// node's companion daemon (endpoint port + 1), build the container report
// and reap containers of pits that vanished or requested a stop.
async function tick () {
    let containers = {}
    let nodes = await getAllNodes()
    await to(Parallel.each(nodes, async node => {
        let [err, nodeContainers] = await to(getContainersOnNode(node))
        let online = !!nodeContainers
        if (node == headNode) {
            if (err) {
                log.error('Problem accessing head node', err.toString())
            }
        } else {
            if (online != node.online) {
                if (err) {
                    log.error('Problem accessing node ' + node.id, err.toString())
                }
                node.online = online
                node.since = Date.now()
                await node.save()
            }
            if (online) {
                let murl = url.parse(node.endpoint)
                let durl = 'http://' + murl.hostname + ':' + (parseInt(murl.port || 80) + 1)
                try {
                    let utilizations = []
                    // Intentional assignment-in-condition: keep lines matching utilParser.
                    for (let data of (await axios.get(durl)).data.split('\n')) {
                        if (data = utilParser.exec(data)) {
                            utilizations.push(data)
                        }
                    }
                    if (utilizations.length > 0) {
                        let resources = await node.getResources()
                        for (let i = 0; i < utilizations.length; i++) {
                            let resource = resources.find(r => r.index == i)
                            if (resource) {
                                await resource.addUtilization(
                                    parseFloat(utilizations[i][2]) / 100.0,
                                    parseFloat(utilizations[i][3]) / 100.0
                                )
                            }
                        }
                    }
                } catch (ex) {} // utilization sampling is best-effort
            }
        }
        if (!err && nodeContainers) {
            for (let containerName of nodeContainers) {
                let containerInfo = parseContainerName(containerName)
                if (containerInfo) {
                    let pitId = containerInfo[1]
                    let pitContainers = containers[pitId]
                    if (!pitContainers) {
                        pitContainers = containers[pitId] = []
                    }
                    pitContainers.push(containerInfo)
                }
            }
        }
    }))
    clusterEvents.emit('containerReport', containers)
    let pits = Object.keys(containers)
    clusterEvents.emit('pitReport', pits)
    await Parallel.each(pits, async pitId => {
        if (!(await Pit.findByPk(pitId)) || (await pitRequestedStop(pitId))) {
            log.debug('Stopping zombie containers of pit', pitId)
            await stopContainers(pitId)
        }
    })
}

// Keep the module-level container cache in sync with each tick's report.
clusterEvents.on('containerReport', containers => currentContainers = containers)

// Self-rescheduling poll loop; errors do not break the cycle.
function loop () {
    let goon = () => setTimeout(loop, config.pollInterval)
    tick().then(goon).catch(goon)
}

exports.startup = async function () {
    loop()
}

--------------------------------------------------------------------------------
/src/reservations.js:
--------------------------------------------------------------------------------
const { MultiRange } = require('multi-integer-range')
const sequelize = require('./models/db.js')
const Node = require('./models/Node-model.js')
const Resource = require('./models/Resource-model.js')
const Alias = require('./models/Alias-model.js')
const Job = require('./models/Job-model.js')
const User = require('./models/User-model.js')
const Group = require('./models/Group-model.js')
const ProcessGroup = require('./models/ProcessGroup-model.js')
const Process = require('./models/Process-model.js')
const Allocation = require('./models/Allocation-model.js')
const Sequelize = require('sequelize')
const parseClusterRequest = require('./clusterParser.js').parse

const log = require('./utils/logger.js')

var exports = module.exports = {}

async function loadAvailableResources (transaction, userId,
simulation) { 20 | let lock = transaction && transaction.LOCK 21 | let nodes = {} 22 | let nodeWhere = { available: true } 23 | let having = [ 24 | sequelize.or( 25 | sequelize.where(sequelize.fn('count', sequelize.col('resourcegroups->group->usergroups->user.id')), { [Sequelize.Op.gt]: 0 }), 26 | sequelize.where(sequelize.fn('count', sequelize.col('resourcegroups->group.id')), 0) 27 | ) 28 | ] 29 | if (!simulation) { 30 | having.push(sequelize.where(sequelize.fn('count', sequelize.col('allocations->process->processgroup->job.id')), 0)) 31 | nodeWhere.online = true 32 | } 33 | let resources = await Resource.findAll({ 34 | include: [ 35 | { 36 | model: Node, 37 | attributes: [], 38 | transaction: transaction, 39 | lock: lock, 40 | where: nodeWhere 41 | }, 42 | { 43 | model: Alias, 44 | require: false, 45 | attributes: ['id'], 46 | transaction: transaction, 47 | lock: lock 48 | }, 49 | { 50 | model: Allocation, 51 | require: false, 52 | attributes: [], 53 | transaction: transaction, 54 | lock: lock, 55 | include: [ 56 | { 57 | model: Process, 58 | require: false, 59 | attributes: [], 60 | transaction: transaction, 61 | lock: lock, 62 | include: [ 63 | { 64 | model: ProcessGroup, 65 | require: false, 66 | attributes: [], 67 | transaction: transaction, 68 | lock: lock, 69 | include: [ 70 | { 71 | model: Job, 72 | require: false, 73 | attributes: [], 74 | transaction: transaction, 75 | lock: lock, 76 | where: { 77 | state: { 78 | [Sequelize.Op.gte]: Job.jobStates.STARTING, 79 | [Sequelize.Op.lte]: Job.jobStates.STOPPING 80 | } 81 | } 82 | } 83 | ] 84 | } 85 | ] 86 | } 87 | ] 88 | }, 89 | { 90 | model: Resource.ResourceGroup, 91 | require: false, 92 | attributes: [], 93 | transaction: transaction, 94 | lock: lock, 95 | include: [ 96 | { 97 | model: Group, 98 | require: false, 99 | attributes: [], 100 | transaction: transaction, 101 | lock: lock, 102 | include: [ 103 | { 104 | model: User.UserGroup, 105 | require: false, 106 | attributes: [], 107 | transaction: 
transaction, 108 | lock: lock, 109 | include: [ 110 | { 111 | model: User, 112 | require: false, 113 | attributes: [], 114 | transaction: transaction, 115 | lock: lock, 116 | where: { id: userId } 117 | } 118 | ] 119 | } 120 | ] 121 | } 122 | ] 123 | } 124 | ], 125 | group: ['resource.id', 'alias.id'], 126 | having: having 127 | }) 128 | for (let resource of resources) { 129 | let nodeResources = nodes[resource.nodeId] 130 | if (!nodeResources) { 131 | nodeResources = nodes[resource.nodeId] = {} 132 | } 133 | nodeResources[resource.id] = resource 134 | } 135 | return nodes 136 | } 137 | 138 | function reserveProcess (nodeResources, clusterReservation, resourceList) { 139 | let processReservation = {} 140 | for (let resource of resourceList) { 141 | let resourceCounter = resource.count 142 | for(let resourceId of Object.keys(nodeResources)) { 143 | if (resourceCounter > 0) { 144 | let nodeResource = nodeResources[resourceId] 145 | if ((nodeResource.name == resource.name || (nodeResource.alias && nodeResource.alias.id == resource.name)) && 146 | !clusterReservation[resourceId]) { 147 | processReservation[resourceId] = nodeResource 148 | resourceCounter-- 149 | } 150 | } 151 | } 152 | if (resourceCounter > 0) { 153 | return null 154 | } 155 | } 156 | return processReservation 157 | } 158 | 159 | function reservationSummary (clusterReservation) { 160 | if (!clusterReservation) { 161 | return 162 | } 163 | let nodes = {} 164 | for(let resource of Object.keys(clusterReservation).map(k => clusterReservation[k])) { 165 | let resources = nodes[resource.nodeId] 166 | if (resources) { 167 | resources.push(resource) 168 | } else { 169 | nodes[resource.nodeId] = [resource] 170 | } 171 | } 172 | let summary = '' 173 | for(let nodeId of Object.keys(nodes)) { 174 | let nodeResources = nodes[nodeId].filter(r => r.type) 175 | if (summary != '') { 176 | summary += ' + ' 177 | } 178 | summary += nodeId + '[' 179 | let first = true 180 | for(let type of nodeResources.map(r => 
r.type).filter((v, i, a) => a.indexOf(v) === i)) { 181 | let resourceIndices = nodeResources.filter(r => r.type == type).map(r => r.index) 182 | if (resourceIndices.length > 0) { 183 | if (!first) { 184 | summary += ' + ' 185 | } 186 | summary += type + ' ' + new MultiRange(resourceIndices.join(',')).getRanges() 187 | .map(range => range[0] == range[1] ? range[0] : range[0] + '-' + range[1]) 188 | .join(',') 189 | first = false 190 | } 191 | } 192 | summary += ']' 193 | } 194 | return summary 195 | } 196 | 197 | function requestedResources (clusterRequest) { 198 | if (typeof clusterRequest == 'string') { 199 | clusterRequest = parseClusterRequest(clusterRequest) 200 | } 201 | let resources = new Set() 202 | for (let groupRequest of clusterRequest) { 203 | for (let resource of groupRequest.process) { 204 | resources.add(resource.name) 205 | } 206 | } 207 | return resources 208 | } 209 | 210 | async function allocate (clusterRequest, userId, job) { 211 | if (typeof clusterRequest == 'string') { 212 | clusterRequest = parseClusterRequest(clusterRequest) 213 | } 214 | let simulation = !job 215 | let options 216 | try { 217 | if (simulation) { 218 | options = {} 219 | } else { 220 | let transaction = await sequelize.transaction({ type: Sequelize.Transaction.TYPES.EXCLUSIVE }) 221 | options = { transaction: transaction, lock: transaction.LOCK } 222 | } 223 | let availableResources = await loadAvailableResources(simulation.transaction, userId, simulation) 224 | let nodesWhere = { available: true } 225 | if (!simulation) { 226 | nodesWhere.online = true 227 | } 228 | let availableNodes = (await Node.findAll({ where: nodesWhere }, options)) 229 | let resourceCount = n => availableResources[n.id] ? 
Object.keys(availableResources[n.id]).length : 0 230 | availableNodes = availableNodes.sort((a, b) => resourceCount(a) - resourceCount(b)) 231 | let clusterReservation = {} 232 | for(let groupIndex = 0; groupIndex < clusterRequest.length; groupIndex++) { 233 | let groupRequest = clusterRequest[groupIndex] 234 | let jobProcessGroup 235 | if (!simulation) { 236 | jobProcessGroup = await ProcessGroup.create({ index: groupIndex }, options) 237 | await job.addProcessgroup(jobProcessGroup, options) 238 | } 239 | for(let processIndex = 0; processIndex < groupRequest.count; processIndex++) { 240 | let processReservation 241 | let jobProcess 242 | if (!simulation) { 243 | jobProcess = await Process.create({ index: processIndex }, options) 244 | await jobProcessGroup.addProcess(jobProcess, options) 245 | } 246 | for (let node of availableNodes) { 247 | let nodeResources = availableResources[node.id] || {} 248 | processReservation = reserveProcess(nodeResources, clusterReservation, groupRequest.process) 249 | if (processReservation) { 250 | clusterReservation['node ' + node.id] = { nodeId: node.id } 251 | if (!simulation) { 252 | jobProcess.nodeId = node.id 253 | await jobProcess.save(options) 254 | } 255 | break 256 | } 257 | } 258 | if (processReservation) { 259 | clusterReservation = Object.assign(clusterReservation, processReservation) 260 | if (!simulation) { 261 | for(let resource of Object.keys(processReservation).map(k => processReservation[k])) { 262 | await Allocation.create({ 263 | resourceId: resource.id, 264 | processId: jobProcess.id 265 | }, options) 266 | } 267 | } 268 | } else { 269 | if (!simulation) { 270 | await options.transaction.rollback() 271 | } 272 | return false 273 | } 274 | } 275 | } 276 | if (!simulation) { 277 | job.allocation = reservationSummary(clusterReservation) 278 | await job.save(options) 279 | await options.transaction.commit() 280 | } 281 | return true 282 | } catch (err) { 283 | log.error(err) 284 | options.transaction && await 
options.transaction.rollback() 285 | throw err 286 | } 287 | } 288 | 289 | exports.requestedResources = requestedResources 290 | exports.canAllocate = (request, user) => allocate(request, user.id) 291 | exports.tryAllocate = job => allocate(job.request, job.userId, job) 292 | -------------------------------------------------------------------------------- /src/routes/aliases.js: -------------------------------------------------------------------------------- 1 | const Router = require('express-promise-router') 2 | const Alias = require('../models/Alias-model.js') 3 | 4 | const { ensureSignedIn, ensureAdmin, tryTargetAlias, targetAlias } = require('./mw.js') 5 | 6 | const router = module.exports = new Router() 7 | 8 | router.use(ensureSignedIn) 9 | 10 | router.get('/', async (req, res) => { 11 | res.send((await Alias.findAll()).map(alias => alias.id)) 12 | }) 13 | 14 | router.get('/:alias', targetAlias, async (req, res) => { 15 | res.send({ 16 | id: req.targetAlias.id, 17 | name: req.targetAlias.name 18 | }) 19 | }) 20 | 21 | router.use(ensureAdmin) 22 | 23 | router.put('/:alias', tryTargetAlias, async (req, res) => { 24 | if(req.targetAlias) { 25 | return Promise.reject({ code: 400, message: 'Alias already existing' }) 26 | } 27 | if (req.body && req.body.name) { 28 | await Alias.create({ id: req.params.alias, name: req.body.name }) 29 | res.send() 30 | } else { 31 | res.status(400).send() 32 | } 33 | }) 34 | 35 | router.post('/:alias', targetAlias, async (req, res) => { 36 | if (req.body && req.body.name) { 37 | req.targetAlias.name = req.body.name 38 | await req.targetAlias.save() 39 | res.send() 40 | } else { 41 | res.status(400).send() 42 | } 43 | }) 44 | 45 | router.delete('/:alias', targetAlias, async (req, res) => { 46 | await req.targetAlias.destroy() 47 | res.send() 48 | }) 49 | -------------------------------------------------------------------------------- /src/routes/groups.js: 
--------------------------------------------------------------------------------
const Router = require('express-promise-router')
const clusterEvents = require('../utils/clusterEvents.js')
const config = require('../config.js')
const simplefs = require('../utils/simplefs.js')
const Group = require('../models/Group-model.js')
const { ensureSignedIn, ensureAdmin, tryTargetGroup, targetGroup, memberOrAdmin } = require('./mw.js')

const router = module.exports = new Router()

// All group routes require authentication; mutations additionally require admin.
router.use(ensureSignedIn)

router.get('/', async (req, res) => {
    res.send((await Group.findAll()).map(group => group.id))
})

router.get('/:group', targetGroup, async (req, res) => {
    res.send({
        id: req.targetGroup.id,
        title: req.targetGroup.title
    })
})

// File access to the group directory, for group members or admins.
router.all('/:group/simplefs/' + simplefs.pattern, targetGroup, memberOrAdmin, async (req, res) => {
    let baseDir = Group.getDir(req.targetGroup.id)
    await simplefs.performCommand(baseDir, req, res)
})

router.use(ensureAdmin)

// NOTE(review): aliases.js rejects an existing target with 400, this route
// uses 403 — presumably one of the two should be aligned; confirm intent.
router.put('/:group', tryTargetGroup, async (req, res) => {
    if (req.targetGroup) {
        return Promise.reject({ code: 403, message: 'Group already existing' })
    }
    if (req.body && req.body.title) {
        await Group.create({
            id: req.params.group,
            title: req.body.title
        })
        res.send()
    } else {
        res.status(400).send()
    }
})

router.post('/:group', targetGroup, async (req, res) => {
    if (req.body && req.body.title) {
        req.targetGroup.title = req.body.title
        await req.targetGroup.save()
        res.send()
    } else {
        res.status(400).send()
    }
})

// Deleting a group may revoke access rights — announce 'restricted' so other
// components can re-check permissions.
router.delete('/:group', targetGroup, async (req, res) => {
    await req.targetGroup.destroy()
    res.send()
    clusterEvents.emit('restricted')
})

--------------------------------------------------------------------------------
/src/routes/index.js:
--------------------------------------------------------------------------------
const Router = require('express-promise-router')
const simplefs = require('../utils/simplefs.js')
const config = require('../config.js')

const sharedDir = '/data/shared'

var router = module.exports = new Router()

// Liveness probe.
router.get('/hello', async (req, res) => {
    res.send('Here I am')
})

// Shared directory file access (fourth argument marks it read-only here —
// see simplefs.performCommand).
router.all('/shared/simplefs/' + simplefs.pattern, async (req, res) => {
    await simplefs.performCommand(sharedDir, req, res, true)
})

router.use('/users', require('./users'))
router.use('/groups', require('./groups'))
router.use('/jobs', require('./jobs'))
router.use('/nodes', require('./nodes'))
router.use('/aliases', require('./aliases'))

--------------------------------------------------------------------------------
/src/routes/jobs.js:
--------------------------------------------------------------------------------
const fs = require('fs-extra')
const path = require('path')
const Tail = require('tail').Tail
const Parallel = require('async-parallel')
const Sequelize = require('sequelize')
const Router = require('express-promise-router')
const Pit = require('../models/Pit-model.js')
const Job = require('../models/Job-model.js')
const config = require('../config.js')
const scheduler = require('../scheduler.js')
const pitRunner = require('../pitRunner.js')
const reservations = require('../reservations.js')
const parseClusterRequest = require('../clusterParser.js').parse

const log = require('../utils/logger.js')
const simplefs = require('../utils/simplefs.js')
const clusterEvents = require('../utils/clusterEvents.js')
const { getDuration } = require('../utils/dateTime.js')
const { ensureSignedIn, ensureUpgrade, targetJob, targetInstance, targetGroup } = require('./mw.js')

const jobStates = Job.jobStates

var
router = module.exports = new Router() 24 | 25 | router.use(ensureSignedIn) 26 | 27 | router.post('/', async (req, res) => { 28 | let job = req.body 29 | let clusterRequest 30 | try { 31 | clusterRequest = parseClusterRequest(job.clusterRequest) 32 | } catch (ex) { 33 | res.status(400).send({ message: 'Problem parsing allocation' }) 34 | return 35 | } 36 | if (!(await reservations.canAllocate(clusterRequest, req.user))) { 37 | res.status(406).send({ message: 'Cluster cannot fulfill resource request' }) 38 | return 39 | } 40 | if (job.continueJob) { 41 | let continueJob = await Job.findByPk(job.continueJob) 42 | if (!continueJob) { 43 | res.status(404).send({ message: 'The job to continue is not existing' }) 44 | return 45 | } 46 | if (!(await req.user.canAccessJob(continueJob))) { 47 | res.status(403).send({ message: 'Continuing provided job not allowed for current user' }) 48 | return 49 | } 50 | } 51 | 52 | let pit 53 | let newJob 54 | let archivePath 55 | try { 56 | let provisioning 57 | if (job.origin) { 58 | if (job.hash) { 59 | provisioning = 'Git commit ' + job.hash + ' from ' + job.origin 60 | } else { 61 | provisioning = 'Git clone of ' + job.origin 62 | } 63 | } else if (job.archive) { 64 | let basePath = req.user.getDir() 65 | archivePath = path.resolve(basePath, job.archive) 66 | if (!archivePath.startsWith(basePath)) { 67 | res.status(403).send({ message: 'Archive outside user home' }) 68 | return 69 | } 70 | if (!(await fs.pathExists(archivePath))) { 71 | res.status(404).send({ message: 'Archive not found' }) 72 | return 73 | } 74 | provisioning = 'Archive (' + (await fs.stat(basePath)).size + ' bytes)' 75 | } else { 76 | provisioning = 'Script' 77 | } 78 | if (job.diff) { 79 | provisioning += ' with ' + 80 | (job.diff + '').split('\n').length + ' LoC diff' 81 | } 82 | pit = await Pit.create() 83 | newJob = await Job.create({ 84 | id: pit.id, 85 | userId: req.user.id, 86 | description: ('' + job.description).substring(0,40), 87 | provisioning: 
provisioning, 88 | request: job.clusterRequest, 89 | continues: job.continueJob 90 | }) 91 | if (!job.private) { 92 | for(let autoshare of (await req.user.getAutoshares())) { 93 | await Job.JobGroup.create({ jobId: newJob.id, groupId: autoshare.groupId }) 94 | } 95 | } 96 | let files = {} 97 | files['script.sh'] = (job.script || 'if [ -f .compute ]; then bash .compute; fi') + '\n' 98 | if (job.origin) { 99 | files['origin'] = job.origin 100 | if (job.hash) { 101 | files['hash'] = job.hash 102 | } 103 | } else if (archivePath) { 104 | await fs.copy(archivePath, path.join(newJob.getDir(), 'archive.tar.gz')) 105 | } 106 | if (job.diff) { 107 | files['git.patch'] = job.diff + '\n' 108 | } 109 | let jobDir = Pit.getDir(pit.id) 110 | await Parallel.each(Object.keys(files), filename => fs.writeFile(path.join(jobDir, filename), files[filename])) 111 | await newJob.setState(jobStates.NEW) 112 | res.status(200).send({ id: pit.id }) 113 | } catch (ex) { 114 | if (newJob) { 115 | await newJob.destroy() 116 | } 117 | if (pit) { 118 | await pit.destroy() 119 | } 120 | res.status(500).send({ message: ex.toString() }) 121 | } 122 | }) 123 | 124 | function getJobDescription(job) { 125 | return { 126 | id: job.id, 127 | description: job.description, 128 | user: job.userId, 129 | resources: job.allocation || job.request, 130 | state: job.state, 131 | date: job.since, 132 | since: getDuration(new Date(), job.since), 133 | schedulePosition: job.rank, 134 | utilComp: job.state == jobStates.RUNNING ? job.dataValues.curcompute : 135 | (job.dataValues.aggcompute / (job.dataValues.samples || 1)), 136 | utilMem: job.state == jobStates.RUNNING ? 
job.dataValues.curmemory : 137 | (job.dataValues.aggmemory / (job.dataValues.samples || 1)) 138 | } 139 | } 140 | 141 | router.get('/', async (req, res) => { 142 | const orderings = { 143 | 'date': 'since', 144 | 'user': 'user', 145 | 'title': 'description', 146 | 'state': 'state' 147 | } 148 | let query = { where: {}, order: [], limit: config.queryLimit } 149 | const parseDate = v => { try { return new Date(v) } catch (ex) { return null } } 150 | let parsers = { 151 | since: v => parseDate(v) ? (query.where.since = { [Sequelize.Op.gte]: parseDate(v) }) : false, 152 | till: v => parseDate(v) ? (query.where.since = { [Sequelize.Op.lte]: parseDate(v) }) : false, 153 | user: v => query.where.userId = v, 154 | title: v => query.where.description = { [Sequelize.Op.like]: v }, 155 | asc: v => orderings[v] ? query.order.push([orderings[v], 'ASC']) : false, 156 | desc: v => orderings[v] ? query.order.push([orderings[v], 'DESC']) : false, 157 | limit: v => !isNaN(parseInt(v)) && (query.limit = Math.min(v, query.limit)), 158 | offset: v => !isNaN(parseInt(v)) && (query.offset = v) 159 | } 160 | for(let param of Object.keys(req.query)) { 161 | let parser = parsers[param] 162 | if (parser) { 163 | if (!parser(req.query[param])) { 164 | res.status(400).send({ message: 'Cannot parse query parameter ' + param }) 165 | return 166 | } 167 | } else { 168 | res.status(400).send({ message: 'Unknown query parameter ' + param }) 169 | return 170 | } 171 | } 172 | query.order.push(['since', 'DESC']) 173 | let jobs = await Job.findAll(Job.infoQuery(query)) 174 | res.send(jobs.map(job => getJobDescription(job))) 175 | }) 176 | 177 | router.get('/status', async (req, res) => { 178 | let query = Job.infoQuery({ 179 | where: { state: { [Sequelize.Op.gte]: jobStates.NEW, [Sequelize.Op.lte]: jobStates.STOPPING } } 180 | }) 181 | let jobs = await Job.findAll(query) 182 | let running = jobs 183 | .filter(j => j.state >= jobStates.STARTING && j.state <= jobStates.STOPPING) 184 | .sort((a,b) => 
a.id - b.id) 185 | let waiting = jobs 186 | .filter(j => j.state == jobStates.WAITING) 187 | .sort((a,b) => a.rank - b.rank) 188 | waiting = waiting.concat(jobs.filter(j => j.state == jobStates.PREPARING)) 189 | waiting = waiting.concat(jobs.filter(j => j.state == jobStates.NEW)) 190 | let done = await Job.findAll(Job.infoQuery({ 191 | where: { state: { [Sequelize.Op.gt]: jobStates.STOPPING } }, 192 | order: [['since', 'DESC']], 193 | limit: 20 194 | })) 195 | res.send({ 196 | running: running.map(job => getJobDescription(job)), 197 | waiting: waiting.map(job => getJobDescription(job)), 198 | done: done .map(job => getJobDescription(job)) 199 | }) 200 | }) 201 | 202 | router.get('/:job', async (req, res) => { 203 | let query = Job.infoQuery({ where: { id: req.params.job } }) 204 | let job = await Job.findOne(query) 205 | if (!job) { 206 | return Promise.reject({ code: 404, message: 'Job not found' }) 207 | } 208 | let description = getJobDescription(job) 209 | description.allocation = job.allocation 210 | description.clusterRequest = job.clusterRequest 211 | if (job.continues) { 212 | description.continueJob = job.continues 213 | } 214 | if(await req.user.canAccessJob(job)) { 215 | let groups = (await job.getJobgroups()).map(jg => jg.groupId) 216 | description.provisioning = job.provisioning 217 | description.groups = groups.length > 0 && groups 218 | description.stateChanges = (await job.getStates({ order: ['since'] })).map(s => ({ 219 | state: s.state, 220 | since: s.since, 221 | reason: s.reason 222 | })) 223 | let processes = [] 224 | for(let processGroup of await job.getProcessgroups()) { 225 | for(let jobProcess of await processGroup.getProcesses()) { 226 | processes.push({ 227 | groupIndex: processGroup.index, 228 | processIndex: jobProcess.index, 229 | status: (jobProcess.status === 0 || jobProcess.status > 0) ? 
jobProcess.status : '?', 230 | result: jobProcess.result 231 | }) 232 | } 233 | } 234 | if (processes.length > 0) { 235 | description.processes = processes 236 | } 237 | } 238 | res.send(description) 239 | }) 240 | 241 | async function canAccess (req, res) { 242 | return (await req.user.canAccessJob(req.targetJob)) ? Promise.resolve('next') : Promise.reject({ code: 403, message: 'Forbidden' }) 243 | } 244 | 245 | router.put('/:job/groups/:group', targetJob, canAccess, targetGroup, async (req, res) => { 246 | await Job.JobGroup.upsert({ jobId: req.targetJob.id, groupId: req.targetGroup.id }) 247 | res.send() 248 | }) 249 | 250 | router.delete('/:job/groups/:group', targetJob, canAccess, targetGroup, async (req, res) => { 251 | await Job.JobGroup.destroy({ where: { jobId: req.targetJob.id, groupId: req.targetGroup.id } }) 252 | res.send() 253 | clusterEvents.emit('restricted') 254 | }) 255 | 256 | router.all('/:job/simplefs/' + simplefs.pattern, targetJob, canAccess, async (req, res) => { 257 | let baseDir = Pit.getDir(req.targetJob.id) 258 | await simplefs.performCommand(baseDir, req, res) 259 | }) 260 | 261 | router.get('/:job/log', targetJob, canAccess, async (req, res) => { 262 | res.writeHead(200, { 263 | 'Connection': 'keep-alive', 264 | 'Content-Type': 'text/plain', 265 | 'Cache-Control': 'no-cache' 266 | }) 267 | req.connection.setTimeout(60 * 60 * 1000) 268 | let interval = config.pollInterval 269 | let logPath = path.join(Pit.getDir(req.targetJob.id), 'pit.log') 270 | 271 | if (req.targetJob.state < jobStates.DONE) { 272 | let tail 273 | let startTail = () => { 274 | tail = new Tail(logPath, { fromBeginning: true }) 275 | tail.on("line", line => !res.finished && res.write(line + '\n')) 276 | tail.on("error", stopTail) 277 | res.on('close', stopTail) 278 | res.on('end', stopTail) 279 | } 280 | let stopTail = () => { 281 | if (tail) { 282 | tail.unwatch() 283 | tail = null 284 | } 285 | res.end() 286 | } 287 | let poll = () => { 288 | if (tail) { 289 | 
req.targetJob.reload().then(() => { 290 | if (req.targetJob.state == jobStates.DONE) { 291 | stopTail() 292 | } else { 293 | setTimeout(poll, interval) 294 | } 295 | }).catch(stopTail) 296 | } else { 297 | if (fs.existsSync(logPath)) { 298 | startTail() 299 | } 300 | setTimeout(poll, interval) 301 | } 302 | } 303 | poll() 304 | } else if (fs.existsSync(logPath)) { 305 | let stream = fs.createReadStream(logPath) 306 | stream.on('data', chunk => res.write(chunk)) 307 | stream.on('end', res.end.bind(res)) 308 | } else { 309 | res.status(404).send() 310 | } 311 | }) 312 | 313 | router.get('/:job/instances/:instance/exec', ensureUpgrade, targetJob, targetInstance, canAccess, async (req, res) => { 314 | if (!req.query.context) { 315 | throw { code: 400, message: 'No command' } 316 | } 317 | let context = JSON.parse(req.query.context) 318 | let pitSockets = await pitRunner.exec(req.targetJob.id, req.targetInstance, context) 319 | if (!pitSockets) { 320 | throw { code: 404, message: 'Worker not active' } 321 | } 322 | res.openSocket(async client => { 323 | let stdin = pitSockets['0'] 324 | let control = pitSockets.control 325 | client.on('message', msg => { 326 | if (msg[0] == 0 && control.readyState === control.OPEN) { 327 | control.send(msg.slice(1)) 328 | } else if (msg[0] == 1 && stdin.readyState === stdin.OPEN) { 329 | stdin.send(msg.slice(1)) 330 | } 331 | }) 332 | let sendToClient = (buffer, n) => { 333 | if (client.readyState === client.OPEN) { 334 | client.send(Buffer.concat([ 335 | new Buffer([n]), 336 | Buffer.isBuffer(buffer) ? 
buffer : Buffer.from(buffer) 337 | ])) 338 | } 339 | } 340 | let sockets 341 | if (context.interactive) { 342 | sockets = [client, control, stdin] 343 | stdin.on('message', msg => sendToClient(msg, 1)) 344 | } else { 345 | let stdout = pitSockets['1'] 346 | let stderr = pitSockets['2'] 347 | sockets = [client, control, stdin, stdout, stderr] 348 | stdout.on('message', msg => sendToClient(msg, 1)) 349 | stderr.on('message', msg => sendToClient(msg, 2)) 350 | } 351 | control.on('message', msg => sendToClient(msg, 0)) 352 | let close = () => sockets.forEach(s => s && s.close()) 353 | sockets.forEach(s => s && s.on('close', close)) 354 | }) 355 | }) 356 | 357 | router.get('/:job/instances/:instance/forward', ensureUpgrade, targetJob, targetInstance, canAccess, async (req, res) => { 358 | let pitSockets = await pitRunner.exec(req.targetJob.id, req.targetInstance, { 359 | command: ['forwarder.sh'], 360 | interactive: false 361 | }) 362 | if (!pitSockets) { 363 | throw { code: 404, message: 'Worker not active' } 364 | } 365 | res.openSocket(async client => { 366 | let stdin = pitSockets['0'] 367 | let stdout = pitSockets['1'] 368 | let sockets = [client, stdin, stdout, pitSockets['2'], pitSockets['control']] 369 | let connected = true 370 | client.on('message', msg => connected && stdin .send(msg)) 371 | stdout.on('message', msg => connected && client.send(msg)) 372 | let close = () => { connected = false; sockets.forEach(s => s.close()) } 373 | sockets.forEach(s => s.on('close', close)) 374 | }) 375 | }) 376 | 377 | router.post('/:job/stop', targetJob, canAccess, async (req, res) => { 378 | if (req.targetJob.state <= jobStates.STOPPING) { 379 | await scheduler.stopJob(req.targetJob, 'Stopped by user ' + req.user.id) 380 | res.send() 381 | } else { 382 | res.status(412).send({ message: 'Only jobs before or in running state can be stopped' }) 383 | } 384 | }) 385 | 386 | router.delete('/:job', targetJob, canAccess, async (req, res) => { 387 | if (req.targetJob.state >= 
jobStates.DONE) { 388 | await req.targetJob.destroy() 389 | res.send() 390 | } else { 391 | res.status(412).send({ message: 'Only stopped jobs can be deleted' }) 392 | } 393 | }) 394 | -------------------------------------------------------------------------------- /src/routes/mw.js: -------------------------------------------------------------------------------- 1 | const jwt = require('jsonwebtoken') 2 | const bcrypt = require('bcrypt') 3 | 4 | const config = require('../config.js') 5 | const Alias = require('../models/Alias-model.js') 6 | const Group = require('../models/Group-model.js') 7 | const Job = require('../models/Job-model.js') 8 | const Node = require('../models/Node-model.js') 9 | const User = require('../models/User-model.js') 10 | 11 | const log = require('../utils/logger.js') 12 | const parseDuration = require('parse-duration') 13 | 14 | var exports = module.exports = {} 15 | 16 | async function targetAlias (req, res, ensure) { 17 | if (!/^[a-z][a-z0-9]*$/.test(req.params.alias)) { 18 | return Promise.reject({ code: 400, message: 'Wrong alias format' }) 19 | } 20 | req.targetAlias = await Alias.findByPk(req.params.alias) 21 | if (ensure && !req.targetAlias) { 22 | return Promise.reject({ code: 404, message: 'Alias not found'}) 23 | } 24 | return Promise.resolve('next') 25 | } 26 | exports.tryTargetAlias = (req, res) => targetAlias(req, res, false) 27 | exports.targetAlias = (req, res) => targetAlias(req, res, true) 28 | 29 | async function targetGroup (req, res, ensure) { 30 | if(!/^[a-z]+$/.test(req.params.group)) { 31 | return Promise.reject({ code: 400, message: 'Wrong group id format' }) 32 | } 33 | req.targetGroup = await Group.findByPk(req.params.group) 34 | if (ensure && !req.targetGroup) { 35 | return Promise.reject({ code: 404, message: 'Group not found'}) 36 | } 37 | return Promise.resolve('next') 38 | } 39 | exports.tryTargetGroup = (req, res) => targetGroup(req, res, false) 40 | exports.targetGroup = (req, res) => targetGroup(req, res, 
true) 41 | 42 | async function targetJob (req, res, ensure) { 43 | if(!/^[0-9]+$/.test(req.params.job)) { 44 | return Promise.reject({ code: 400, message: 'Wrong job id format' }) 45 | } 46 | req.targetJob = await Job.findByPk(req.params.job) 47 | if (ensure && !req.targetJob) { 48 | return Promise.reject({ code: 404, message: 'Job not found'}) 49 | } 50 | return Promise.resolve('next') 51 | } 52 | exports.tryTargetJob = (req, res) => targetJob(req, res, false) 53 | exports.targetJob = (req, res) => targetJob(req, res, true) 54 | 55 | async function targetInstance (req, res) { 56 | if(!/^([0-9]+)$/.test(req.params.instance)) { 57 | return Promise.reject({ code: 400, message: 'Wrong instance id format' }) 58 | } 59 | req.targetInstance = req.params.instance 60 | return Promise.resolve('next') 61 | } 62 | exports.targetInstance = targetInstance 63 | 64 | async function targetNode (req, res, ensure) { 65 | if (!/^[a-z][a-z0-9]*$/.test(req.params.node)) { 66 | return Promise.reject({ code: 400, message: 'Wrong node id format' }) 67 | } 68 | req.targetNode = await Node.findByPk(req.params.node) 69 | if (ensure && !req.targetNode) { 70 | return Promise.reject({ code: 404, message: 'Node not found'}) 71 | } 72 | return Promise.resolve('next') 73 | } 74 | exports.tryTargetNode = (req, res) => targetNode(req, res, false) 75 | exports.targetNode = (req, res) => targetNode(req, res, true) 76 | 77 | async function targetUser (req, res, ensure) { 78 | let id = req.params.user 79 | if (req.user && id == '~') { 80 | req.targetUser = req.user 81 | } else { 82 | if (!/^[a-z][a-z0-9]*$/.test(req.params.user)) { 83 | return Promise.reject({ code: 400, message: 'Wrong user id format' }) 84 | } 85 | req.targetUser = await User.findByPk(id) 86 | } 87 | if (ensure && !req.targetUser) { 88 | return Promise.reject({ code: 404, message: 'User not found'}) 89 | } 90 | return Promise.resolve('next') 91 | } 92 | exports.tryTargetUser = (req, res) => targetUser(req, res, false) 93 | 
exports.targetUser = (req, res) => targetUser(req, res, true) 94 | 95 | function signIn (req, res, ensure) { 96 | return new Promise((resolve, reject) => { 97 | const msgAut = 'Authentication:' 98 | let token = req.get('X-Auth-Token') 99 | if (token) { 100 | jwt.verify(token, config.tokenSecret, (err, decoded) => { 101 | if (err) { 102 | if (err.name == 'TokenExpiredError') { 103 | const msgExp = 'Token expired' 104 | log.debug(msgAut, msgExp) 105 | res.status(401).json({ message: msgExp }) 106 | } else { 107 | const msgInv = 'Invalid token' 108 | log.error(msgAut, msgInv) 109 | res.status(400).json({ message: msgInv }) 110 | } 111 | resolve() 112 | } else { 113 | if (decoded.exp > (new Date().getTime() + config.tokenTTL) / 1000) { 114 | const msgTTL = 'Token expiration too far in the future' 115 | log.error(msgAut, msgTTL, '- User:', decoded.user) 116 | res.status(401).json({ message: msgTTL }) 117 | resolve() 118 | } else { 119 | User.findByPk(decoded.user).then(user => { 120 | if (user) { 121 | log.debug(msgAut, 'Token verification successful', decoded) 122 | req.user = user 123 | resolve('next') 124 | } else { 125 | const msgUnk = 'Token for unknown user' 126 | log.error(msgAut, msgUnk, decoded.user) 127 | res.status(401).json({ message: msgUnk }) 128 | resolve() 129 | } 130 | }) 131 | } 132 | } 133 | }) 134 | } else if (ensure) { 135 | let msgNT = 'No token' 136 | log.error(msgAut, msgNT) 137 | reject({ code: 401, message: msgNT }) 138 | } else { 139 | resolve('next') 140 | } 141 | }) 142 | } 143 | exports.trySignIn = (req, res) => signIn(req, res, false) 144 | exports.ensureSignedIn = (req, res) => signIn(req, res, true) 145 | exports.ensureAdmin = (req, res, next) => { 146 | let checkAdmin = () => req.user.admin ? next() : res.status(403).send() 147 | req.user ? checkAdmin() : signIn(req, res, true).then(checkAdmin) 148 | } 149 | exports.selfOrAdmin = (req, res) => (req.user.id == req.targetUser.id || req.user.admin) ? 
150 | Promise.resolve('next') : 151 | Promise.reject({ code: 403, message: 'Forbidden' }) 152 | exports.memberOrAdmin = async (req, res) => (req.user.admin || await req.user.isMemberOf(req.targetGroup)) ? 153 | Promise.resolve('next') : 154 | Promise.reject({ code: 403, message: 'Forbidden' }) 155 | 156 | exports.ensureUpgrade = (req, res) => res.openSocket ? 157 | Promise.resolve('next') : 158 | Promise.reject({ code: 404, message: 'Only web-socket upgrades' }) 159 | 160 | function verify (req, res, ensure) { 161 | req.verified = false 162 | if (req.body && req.body.verification && req.user) { 163 | return new Promise((resolve, reject) => { 164 | bcrypt.compare(req.body.verification, req.user.password, (err, result) => { 165 | if (result) { 166 | req.verified = true 167 | resolve('next') 168 | } else { 169 | log.error('Verification for user ' + req.user.id + ' failed') 170 | reject({ code: 401, message: 'Verfification failed' }) 171 | } 172 | }) 173 | }) 174 | } else if (ensure) { 175 | return Promise.reject({ code: 403, message: 'Only verified access' }) 176 | } else { 177 | return Promise.resolve('next') 178 | } 179 | } 180 | 181 | exports.tryVerify = (req, res) => verify(req, res, false) 182 | exports.ensureVerified = (req, res) => verify(req, res, true) 183 | -------------------------------------------------------------------------------- /src/routes/nodes.js: -------------------------------------------------------------------------------- 1 | const Router = require('express-promise-router') 2 | const Parallel = require('async-parallel') 3 | const fs = require('fs-extra') 4 | const path = require('path') 5 | const log = require('../utils/logger.js') 6 | const clusterEvents = require('../utils/clusterEvents.js') 7 | const pitRunner = require('../pitRunner.js') 8 | const Pit = require('../models/Pit-model.js') 9 | const Node = require('../models/Node-model.js') 10 | const Resource = require('../models/Resource-model.js') 11 | const { getAlias } = 
require('../models/Alias-model.js') 12 | const { getScript } = require('../utils/scripts.js') 13 | const { ensureSignedIn, ensureAdmin, tryTargetNode, targetNode, targetGroup } = require('./mw.js') 14 | 15 | const resourceParser = /resource:([^,]*),([^,]*),([^,]*)/ 16 | 17 | async function getResourcesFromScan (pitId) { 18 | let workers = await pitRunner.getResults(pitId) 19 | if (workers.length <= 0) { 20 | return 21 | } 22 | let resources = [] 23 | if (!(workers[0].result)) { 24 | return resources 25 | } 26 | for (let line of workers[0].result.split('\n')) { 27 | let match = resourceParser.exec(line) 28 | if (match) { 29 | let resource = Resource.build({ 30 | type: match[1], 31 | name: match[3], 32 | index: Number(match[2]) 33 | }) 34 | resources.push(resource) 35 | } 36 | } 37 | return resources 38 | } 39 | 40 | var router = module.exports = new Router() 41 | 42 | router.use(ensureSignedIn) 43 | 44 | router.get('/', async (req, res) => { 45 | res.status(200).send((await Node.findAll()).map(node => node.id)) 46 | }) 47 | 48 | router.get('/:node', targetNode, async (req, res) => { 49 | let dbResources = await req.targetNode.getResources() 50 | res.status(200).json({ 51 | id: req.targetNode.id, 52 | endpoint: req.targetNode.endpoint, 53 | online: req.targetNode.online, 54 | since: req.targetNode.since, 55 | resources: dbResources.length == 0 ? undefined : await Parallel.map(dbResources, async dbResource => { 56 | let dbGroups = await dbResource.getResourcegroups() 57 | return { 58 | type: dbResource.type, 59 | name: dbResource.name, 60 | index: dbResource.index, 61 | groups: dbGroups.length == 0 ? 
undefined : dbGroups.map(group => group.groupId), 62 | alias: await getAlias(dbResource.name) 63 | } 64 | }) 65 | }) 66 | }) 67 | 68 | router.use(ensureAdmin) 69 | 70 | router.put('/:node', tryTargetNode, async (req, res) => { 71 | let id = req.params.node 72 | let node = req.body 73 | if (req.targetNode) { 74 | res.status(400).send({ message: 'Node with same id already registered' }) 75 | } else if (node && node.endpoint && node.password) { 76 | let pit 77 | let dbnode 78 | try { 79 | dbnode = await Node.create({ 80 | id: id, 81 | endpoint: node.endpoint, 82 | password: node.password, 83 | online: true, 84 | since: Date.now(), 85 | available: false 86 | }) 87 | pit = await Pit.create() 88 | await fs.writeFile(path.join(pit.getDir(), 'script.sh'), getScript('scan.sh')) 89 | await pitRunner.runPit(pit.id, {}, [{ 90 | node: dbnode, 91 | devices: { 'gpu': { type: 'gpu' } } 92 | }]) 93 | let resources = await getResourcesFromScan(pit.id) 94 | if (resources) { 95 | resources.forEach(async resource => { 96 | await resource.save() 97 | await dbnode.addResource(resource) 98 | }) 99 | dbnode.online = true 100 | dbnode.available = true 101 | await dbnode.save() 102 | res.send() 103 | } else { 104 | throw new Error('Node scanning failed') 105 | } 106 | } catch (ex) { 107 | if (dbnode) { 108 | await dbnode.destroy() 109 | } 110 | res.status(400).send({ message: 'Problem adding node' }) 111 | } finally { 112 | if (pit) { 113 | await pit.destroy() 114 | } 115 | } 116 | } else { 117 | res.status(400).send() 118 | } 119 | }) 120 | 121 | router.delete('/:node', targetNode, async (req, res) => { 122 | await req.targetNode.destroy() 123 | res.send() 124 | }) 125 | 126 | router.put('/:node/groups/:group', targetNode, targetGroup, async (req, res) => { 127 | for (let resource of await req.targetNode.getResources()) { 128 | await Resource.ResourceGroup.upsert({ resourceId: resource.id, groupId: req.targetGroup.id }) 129 | } 130 | res.send() 131 | clusterEvents.emit('restricted') 132 | 
}) 133 | 134 | router.delete('/:node/groups/:group', targetNode, targetGroup, async (req, res) => { 135 | for (let resource of await req.targetNode.getResources()) { 136 | await Resource.ResourceGroup.destroy({ where: { resourceId: resource.id, groupId: req.targetGroup.id } }) 137 | } 138 | res.send() 139 | clusterEvents.emit('restricted') 140 | }) 141 | 142 | async function targetResource (req, res) { 143 | let targetResources = await req.targetNode.getResources({ where: { index: req.params.resource } }) 144 | req.targetResource = targetResources.length == 1 ? targetResources[0] : undefined 145 | return req.targetResource ? Promise.resolve('next') : Promise.reject({ code: 404, message: 'Resource not found' }) 146 | } 147 | 148 | router.put('/:node/resources/:resource/groups/:group', targetNode, targetResource, targetGroup, async (req, res) => { 149 | await Resource.ResourceGroup.upsert({ resourceId: req.targetResource.id, groupId: req.targetGroup.id }) 150 | res.send() 151 | clusterEvents.emit('restricted') 152 | }) 153 | 154 | router.delete('/:node/resources/:resource/groups/:group', targetNode, targetResource, targetGroup, async (req, res) => { 155 | await Resource.ResourceGroup.destroy({ where: { resourceId: req.targetResource.id, groupId: req.targetGroup.id } }) 156 | res.send() 157 | clusterEvents.emit('restricted') 158 | }) 159 | -------------------------------------------------------------------------------- /src/routes/users.js: -------------------------------------------------------------------------------- 1 | const bcrypt = require('bcrypt') 2 | const jwt = require('jsonwebtoken') 3 | const Router = require('express-promise-router') 4 | const config = require('../config.js') 5 | const simplefs = require('../utils/simplefs.js') 6 | const clusterEvents = require('../utils/clusterEvents.js') 7 | const log = require('../utils/logger.js') 8 | const Job = require('../models/Job-model.js') 9 | const User = require('../models/User-model.js') 10 | const { 
trySignIn, 11 | ensureSignedIn, 12 | ensureVerified, 13 | ensureAdmin, 14 | selfOrAdmin, 15 | tryTargetUser, 16 | targetUser, 17 | targetGroup } = require('./mw.js') 18 | 19 | var router = module.exports = new Router() 20 | 21 | router.get('/:user/exists', async (req, res) => { 22 | res.status((await User.findByPk(req.params.user)) ? 200 : 404).send() 23 | }) 24 | 25 | async function applyAndSaveUserConfig (targetUser, userConfig) { 26 | if (userConfig.fullname) { 27 | targetUser.fullname = userConfig.fullname 28 | } 29 | if (userConfig.email) { 30 | targetUser.email = userConfig.email 31 | } 32 | if (userConfig.password) { 33 | targetUser.password = await bcrypt.hash(userConfig.password, config.hashRounds) 34 | } 35 | await targetUser.save() 36 | } 37 | 38 | router.put('/:user', trySignIn, tryTargetUser, async (req, res) => { 39 | if (req.targetUser) { 40 | return Promise.reject({ code: 403, message: 'User already existing' }) 41 | } else { 42 | if (!(req.user && req.user.admin) && await Job.findOne({ where: { userId: req.params.user } })) { 43 | return Promise.reject({ code: 403, message: 'Only admins can re-create this account, as there are already jobs with this user-ID as owner.' 
}) 44 | } 45 | } 46 | await applyAndSaveUserConfig(User.build({ id: req.params.user }), req.body) 47 | res.send() 48 | }) 49 | 50 | router.post('/:user/authenticate', targetUser, async (req, res) => { 51 | bcrypt.compare(req.body.password, req.targetUser.password, (err, result) => { 52 | if(result) { 53 | jwt.sign( 54 | { user: req.targetUser.id }, 55 | config.tokenSecret, 56 | { expiresIn: config.tokenTTL / 1000 }, 57 | (err, token) => { 58 | if (err) { 59 | log.error('Problem signing JWT for user', req.targetUser.id) 60 | res.status(500).send() 61 | } else { 62 | res.status(200).send({ token: token }) 63 | } 64 | } 65 | ) 66 | } else { 67 | log.error('Wrong password - User', req.targetUser.id) 68 | res.status(400).send() 69 | } 70 | }) 71 | }) 72 | 73 | router.get('/', ensureAdmin, async (req, res) => { 74 | res.json((await User.findAll()).map(user => user.id)) 75 | }) 76 | 77 | router.use(ensureSignedIn) 78 | 79 | router.post('/:user', targetUser, selfOrAdmin, ensureVerified, async (req, res) => { 80 | let userConfig = req.body 81 | if (userConfig.admin === true) { 82 | if (req.user.admin) { 83 | req.targetUser.admin = true 84 | } else { 85 | return Promise.reject({ code: 403, message: 'Not allowed' }) 86 | } 87 | } else if (userConfig.admin === false) { 88 | req.targetUser.admin = false 89 | } 90 | await applyAndSaveUserConfig(req.targetUser, userConfig) 91 | if (userConfig.autoshare) { 92 | await req.targetUser.setAutoShares(userConfig.autoshare) 93 | } 94 | res.send() 95 | }) 96 | 97 | router.get('/:user', targetUser, selfOrAdmin, async (req, res) => { 98 | let groups = (await req.targetUser.getUsergroups()).map(ug => ug.groupId) 99 | let autoshares = (await req.targetUser.getAutoshares()).map(a => a.groupId) 100 | res.json({ 101 | id: req.targetUser.id, 102 | fullname: req.targetUser.fullname, 103 | email: req.targetUser.email, 104 | groups: groups.length > 0 ? groups : undefined, 105 | autoshare: autoshares.length > 0 ? 
autoshares : undefined, 106 | admin: req.targetUser.admin 107 | }) 108 | }) 109 | 110 | router.delete('/:user', targetUser, selfOrAdmin, async (req, res) => { 111 | await req.targetUser.destroy() 112 | res.send() 113 | }) 114 | 115 | router.put('/:user/groups/:group', ensureAdmin, targetUser, targetGroup, async (req, res) => { 116 | await User.UserGroup.upsert({ userId: req.targetUser.id, groupId: req.targetGroup.id }) 117 | res.send() 118 | }) 119 | 120 | router.delete('/:user/groups/:group', ensureAdmin, targetUser, targetGroup, async (req, res) => { 121 | await User.UserGroup.destroy({ where: { userId: req.targetUser.id, groupId: req.targetGroup.id } }) 122 | res.send() 123 | clusterEvents.emit('restricted') 124 | }) 125 | 126 | router.all('/:user/simplefs/' + simplefs.pattern, targetUser, selfOrAdmin, async (req, res) => { 127 | let baseDir = User.getDir(req.targetUser.id) 128 | await simplefs.performCommand(baseDir, req, res) 129 | }) 130 | -------------------------------------------------------------------------------- /src/scheduler.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const Parallel = require('async-parallel') 3 | const log = require('./utils/logger.js') 4 | const { to } = require('./utils/async.js') 5 | const { runScript } = require('./utils/scripts.js') 6 | const clusterEvents = require('./utils/clusterEvents.js') 7 | const config = require('./config.js') 8 | const pitRunner = require('./pitRunner.js') 9 | const reservations = require('./reservations.js') 10 | const Job = require('./models/Job-model.js') 11 | const Group = require('./models/Group-model.js') 12 | 13 | const jobStates = Job.jobStates 14 | 15 | var exports = module.exports = {} 16 | 17 | var preparations = {} 18 | 19 | function getBasicEnv (job) { 20 | return { 21 | JOB_NUMBER: job.id, 22 | DATA_ROOT: '/data', 23 | JOB_DIR: job.getDir() 24 | } 25 | } 26 | 27 | function getPreparationEnv (job) { 28 | let env 
= getBasicEnv(job) 29 | if (job.continues) { 30 | env.CONTINUE_JOB_NUMBER = job.continues 31 | } 32 | return env 33 | } 34 | 35 | async function prepareJob (job) { 36 | let env = getPreparationEnv(job) 37 | await job.setState(jobStates.PREPARING) 38 | preparations[job.id] = runScript('prepare.sh', env, async (code, stdout, stderr) => { 39 | delete preparations[job.id] 40 | await job.reload() 41 | if (code == 0 && job.state == jobStates.PREPARING) { 42 | await job.setState(jobStates.WAITING) 43 | } else { 44 | if (code > 0) { 45 | log.debug('Problem during preparation phase of job', job.id, '- process returned', code) 46 | } 47 | await cleanJob( 48 | job, 49 | job.state != jobStates.STOPPING ? 50 | 'Problem during preparation step' : 51 | undefined 52 | ) 53 | } 54 | }) 55 | } 56 | 57 | function stopPreparation (jobId) { 58 | if (preparations[jobId]) { 59 | preparations[jobId].kill('SIGINT') 60 | } 61 | } 62 | 63 | async function startJob (job) { 64 | try { 65 | await job.setState(jobStates.STARTING) 66 | let user = await job.getUser() 67 | if (!user) { 68 | throw new Error("User not existing") 69 | } 70 | let jobEnv = getBasicEnv(job) 71 | 72 | jobEnv.JOB_DIR = '/data/rw/pit' 73 | jobEnv.SRC_DIR = jobEnv.WORK_DIR = '/data/rw/pit/src' 74 | let shares = { 75 | '/ro/shared': path.join(config.mountRoot, 'shared'), 76 | '/data/rw/home': user.getDirExternal() 77 | } 78 | jobEnv.DATA_ROOT = '/data' 79 | jobEnv.SHARED_DIR = '/data/ro/shared' 80 | jobEnv.USER_DIR = '/data/rw/home' 81 | for (let ug of (await user.getUsergroups())) { 82 | shares['/data/rw/group-' + ug.groupId] = Group.getDirExternal(ug.groupId) 83 | jobEnv[ug.groupId.toUpperCase() + '_GROUP_DIR'] = '/data/rw/group-' + ug.groupId 84 | } 85 | 86 | let workerShares = { 87 | '/data/ro/shared': '/mnt/snakepit/shared', 88 | '/data/rw/home': '/mnt/snakepit/home/' + user.id, 89 | // pit added in pitRunner.js (we don't know the pit id here) 90 | } 91 | for (let ug of (await user.getUsergroups())) { 92 | 
workerShares['/data/rw/group-' + ug.groupId] = '/mnt/snakepit/groups/' + ug.groupId 93 | jobEnv[ug.groupId.toUpperCase() + '_GROUP_DIR'] = '/data/rw/group-' + ug.groupId 94 | } 95 | let workerDiskDevices = {} 96 | for (let dest of Object.keys(workerShares)) { 97 | workerDiskDevices[dest.split('/').pop()] = { 98 | path: dest, 99 | source: workerShares[dest], 100 | type: 'disk' 101 | } 102 | } 103 | 104 | if (config.workerEnv) { 105 | for(let ev of Object.keys(config.workerEnv)) { 106 | jobEnv[ev] = config.workerEnv[ev] 107 | } 108 | } 109 | 110 | let workers = [] 111 | let processGroups = await job.getProcessgroups() 112 | jobEnv.NUM_GROUPS = processGroups.length 113 | for (let processGroup of processGroups) { 114 | let processes = await processGroup.getProcesses() 115 | jobEnv['NUM_PROCESSES_GROUP' + processGroup.index] = processes.length 116 | for (let jobProcess of processes) { 117 | let node = await jobProcess.getNode() 118 | jobEnv['HOST_GROUP' + processGroup.index + '_PROCESS' + jobProcess.index] = 119 | pitRunner.getWorkerHost(node.id, job.id, workers.length) 120 | let gpus = {} 121 | let allocations = await jobProcess.getAllocations() 122 | for (let allocation of allocations) { 123 | let resource = await allocation.getResource() 124 | if (resource.type == 'cuda') { 125 | gpus['gpu' + (resource.index + 1)] = { 126 | type: 'gpu', 127 | id: '' + (resource.index + 1) 128 | } 129 | } 130 | } 131 | let mergedDevices = Object.assign({}, gpus, workerDiskDevices) 132 | workers.push({ 133 | node: node, 134 | options: { devices: mergedDevices }, 135 | env: Object.assign({ 136 | GROUP_INDEX: processGroup.index, 137 | PROCESS_INDEX: jobProcess.index 138 | }, jobEnv) 139 | }) 140 | } 141 | } 142 | await pitRunner.startPit(job.id, shares, workers) 143 | await job.setState(jobStates.RUNNING) 144 | } catch (ex) { 145 | log.error('Problem starting job', job.id, ex) 146 | await job.reload() 147 | if (job.state < jobStates.CLEANING) { 148 | await cleanJob(job, 'Problem during 
startup: ' + ex.toString()) 149 | } 150 | } 151 | } 152 | 153 | async function stopJob (job, reason) { 154 | if (job.state >= jobStates.STARTING && job.state <= jobStates.STOPPING) { 155 | await job.setState(jobStates.STOPPING, reason) 156 | await pitRunner.stopPit(job.id) 157 | } else if (job.state == jobStates.PREPARING) { 158 | await job.setState(jobStates.STOPPING, reason) 159 | stopPreparation(job.id) 160 | } else if (job.state == jobStates.WAITING) { 161 | await cleanJob(job, reason) 162 | } else { 163 | await job.setState(jobStates.DONE, reason) 164 | } 165 | } 166 | exports.stopJob = stopJob 167 | 168 | async function cleanJob (job, reason) { 169 | await job.setState(jobStates.CLEANING, reason) 170 | let [err, results] = await to(pitRunner.getResults(job.id)) 171 | if (results) { 172 | let workerIndex = 0 173 | let processGroups = await job.getProcessgroups() 174 | for (let processGroup of processGroups) { 175 | let processes = await processGroup.getProcesses() 176 | for (let jobProcess of processes) { 177 | let workerResult = results[workerIndex] 178 | workerIndex++ 179 | if (workerResult) { 180 | jobProcess.result = workerResult.result 181 | jobProcess.status = workerResult.status 182 | await jobProcess.save() 183 | } 184 | } 185 | } 186 | } 187 | runScript('clean.sh', getPreparationEnv(job), async (code, stdout, stderr) => { 188 | await job.setState( 189 | jobStates.DONE, code > 0 ? 
190 | ('Problem during cleaning step - exit code: ' + code + '\n' + stderr) : 191 | undefined 192 | ) 193 | }) 194 | } 195 | 196 | async function tick () { 197 | log.debug('Tick...') 198 | 199 | for (let job of (await Job.findAll({ where: { state: jobStates.NEW } }))) { 200 | if (Object.keys(preparations).length < config.maxParallelPrep) { 201 | log.debug('Preparing job', job.id) 202 | await prepareJob(job) 203 | } else { 204 | break 205 | } 206 | } 207 | 208 | let isPreparing = {} 209 | for (let job of (await Job.findAll({ where: { state: jobStates.PREPARING } }))) { 210 | if (job.since.getTime() + config.maxPrepDuration < Date.now()) { 211 | await stopJob(job, 'Exceeded max preparation time') 212 | } else { 213 | isPreparing[job.id] = true 214 | } 215 | } 216 | for (let jobId of Object.keys(preparations)) { 217 | if (!isPreparing[jobId]) { 218 | stopPreparation(jobId) 219 | log.error('Stopped orphan preparation process for job', jobId) 220 | } 221 | } 222 | 223 | let waitingFor = new Set() 224 | for (let job of await Job.findAll({ where: { state: jobStates.WAITING }, order: ['rank'] })) { 225 | let resources = reservations.requestedResources(job.request) 226 | if ([...resources].filter(x => waitingFor.has(x)).length === 0) { 227 | waitingFor = new Set([...waitingFor, ...resources]) 228 | log.debug('Trying to allocate job', job.id) 229 | if (await reservations.tryAllocate(job)) { 230 | log.debug('Starting job', job.id) 231 | await startJob(job) 232 | break 233 | } 234 | } 235 | } 236 | } 237 | 238 | function loop () { 239 | let goon = () => setTimeout(loop, config.pollInterval) 240 | tick().then(goon).catch(goon) 241 | } 242 | 243 | exports.startup = async function () { 244 | for (let job of (await Job.findAll({ where: { state: jobStates.PREPARING } }))) { 245 | await cleanJob(job, 'Job interrupted during preparation') 246 | } 247 | for (let job of (await Job.findAll({ where: { state: jobStates.CLEANING } }))) { 248 | await cleanJob(job, 'Job interrupted during 
cleaning') 249 | } 250 | for (let job of (await Job.findAll({ where: { state: jobStates.STOPPING } }))) { 251 | await cleanJob(job, 'Job interrupted during stopping') 252 | } 253 | for (let job of (await Job.findAll({ where: { state: jobStates.STARTING } }))) { 254 | await stopJob(job, 'Job interrupted during starting') 255 | } 256 | 257 | clusterEvents.on('restricted', async () => { 258 | for(let job of (await Job.findAll({ 259 | where: { '$between': [jobStates.PREPARING, jobStates.WAITING] } 260 | }))) { 261 | if (await reservations.canAllocate(job.resourceRequest, job.user)) { 262 | await stopJob(job, 'Cluster cannot fulfill resource request anymore') 263 | } 264 | } 265 | }) 266 | 267 | clusterEvents.on('pitStopping', async pitId => { 268 | let job = await Job.findByPk(pitId) 269 | if (job && job.state < jobStates.STOPPING) { 270 | await job.setState(jobStates.STOPPING) 271 | } 272 | }) 273 | 274 | clusterEvents.on('pitStopped', async pitId => { 275 | let job = await Job.findByPk(pitId) 276 | if (job && job.state < jobStates.CLEANING) { 277 | await cleanJob(job) 278 | } 279 | }) 280 | 281 | clusterEvents.on('pitReport', async pits => { 282 | await Parallel.each(pits, async pitId => { 283 | let job = await Job.findByPk(pitId) 284 | if (job && (job.state < jobStates.STARTING || job.state > jobStates.STOPPING)) { 285 | log.debug('Stopping zombie containers of stopped job', job.id) 286 | await pitRunner.stopPit(job.id) 287 | } 288 | }) 289 | }) 290 | 291 | loop() 292 | } 293 | -------------------------------------------------------------------------------- /src/service.js: -------------------------------------------------------------------------------- 1 | const cluster = require('cluster') 2 | const Parallel = require('async-parallel') 3 | const cpus = require('os').cpus().length 4 | const log = require('./utils/logger.js') 5 | const config = require('./config.js') 6 | const models = require('./models') 7 | const pitRunner = require('./pitRunner.js') 8 | const 
scheduler = require('./scheduler.js') 9 | 10 | async function startup () { 11 | await models.sequelize.sync() 12 | await Parallel.each(models.all, async model => await (model.startup || Function)()) 13 | await pitRunner.startup() 14 | await scheduler.startup() 15 | } 16 | 17 | if (cluster.isMaster) { 18 | cluster.on('exit', (deadWorker, code, signal) => { 19 | if (code === 100) { 20 | process.exit(100) // Preventing fork-loop on startup problems 21 | } 22 | var worker = cluster.fork(); 23 | log.error('Worker ' + deadWorker.process.pid + ' died.') 24 | log.info('Worker ' + worker.process.pid + ' born.') 25 | }) 26 | startup().then(() => { 27 | for (let i = 0; i < cpus; i++) { 28 | cluster.fork() 29 | } 30 | log.info('Snakepit daemon started') 31 | }).catch(ex => { 32 | log.error('Snakepit startup problem:', ex) 33 | process.exit(1) 34 | }) 35 | } else { 36 | try { 37 | const ws = require('ws') 38 | const http = require('http') 39 | const morgan = require('morgan') 40 | const express = require('express') 41 | const bodyParser = require('body-parser') 42 | 43 | let app = express() 44 | app.use(bodyParser.json({ limit: '50mb' })) 45 | app.use(morgan('combined', { 46 | skip: (req, res) => res.statusCode < 400 && !config.debugHttp 47 | })) 48 | 49 | app.use(require('./routes')) 50 | 51 | app.use((err, req, res, next) => { 52 | let message = err.message || 'Internal error' 53 | let code = err.code || 500 54 | log.error('ERROR', code, message) 55 | if (err.stack) { 56 | log.error(err.stack) 57 | } 58 | res.status(code).send({ message: message }) 59 | }) 60 | 61 | const wss = new ws.Server({ noServer: true }) 62 | let server = http.createServer(app) 63 | server.on('upgrade', (req, socket, header) => { 64 | let res = new http.ServerResponse(req) 65 | let headerClone = new Buffer(header.length) 66 | header.copy(headerClone) 67 | res.assignSocket(socket) 68 | res.on('finish', () => res.socket.destroy()) 69 | res.openSocket = cb => wss.handleUpgrade(req, socket, headerClone, 
cb) 70 | return app(req, res) 71 | }) 72 | server.listen(config.port, config.interface) 73 | log.info('Snakepit service running on ' + config.interface + ':' + config.port) 74 | } catch (ex) { 75 | log.error('Failure during startup: ', ex, ex.stack) 76 | 77 | process.exit(100) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/utils/async.js: -------------------------------------------------------------------------------- 1 | var exports = module.exports = {} 2 | 3 | exports.to = function (promise) { 4 | return promise.then(data => [null, data]).catch(err => [err]) 5 | } 6 | 7 | exports.sleep = function (ms) { 8 | return new Promise(resolve => setTimeout(resolve, ms)); 9 | } -------------------------------------------------------------------------------- /src/utils/clusterEvents.js: -------------------------------------------------------------------------------- 1 | const cluster = require('cluster') 2 | const { EventEmitter } = require('events') 3 | 4 | var emitter = module.exports = new EventEmitter() 5 | 6 | let originalEmit = emitter.emit 7 | 8 | function broadcast(message, ignore) { 9 | for(let wid in cluster.workers) { 10 | let worker = cluster.workers[wid] 11 | if (worker !== ignore) { 12 | worker.send(message) 13 | } 14 | } 15 | } 16 | 17 | if (cluster.isMaster) { 18 | cluster.on('fork', worker => { 19 | worker.on('message', message => { 20 | if (message.clusterEvent) { 21 | broadcast(message, worker) 22 | } 23 | }) 24 | }) 25 | } 26 | 27 | process.on('message', message => { 28 | if (message.clusterEvent) { 29 | originalEmit.apply(emitter, [message.clusterEvent, ...message.args]) 30 | } 31 | }) 32 | 33 | emitter.emit = function (clusterEvent, ...args) { 34 | originalEmit.apply(emitter, [clusterEvent, ...args]) 35 | let message = { 36 | clusterEvent: clusterEvent, 37 | args: args 38 | } 39 | if (cluster.isMaster) { 40 | broadcast(message) 41 | } else { 42 | process.send(message) 43 | } 44 | } 45 | 
-------------------------------------------------------------------------------- /src/utils/dateTime.js: -------------------------------------------------------------------------------- 1 | var exports = module.exports = {} 2 | 3 | exports.getDuration = function (date1, date2) { 4 | let delta = Math.abs(date2 - date1) / 1000 5 | let days = Math.floor(delta / 86400) 6 | delta -= days * 86400 7 | let hours = Math.floor(delta / 3600) % 24 8 | delta -= hours * 3600 9 | let minutes = Math.floor(delta / 60) % 60 10 | delta -= minutes * 60 11 | let seconds = Math.floor(delta % 60) 12 | return { 13 | days: days, 14 | hours: hours, 15 | minutes: minutes, 16 | seconds: seconds 17 | } 18 | } -------------------------------------------------------------------------------- /src/utils/logger.js: -------------------------------------------------------------------------------- 1 | const util = require('util') 2 | const cluster = require('cluster') 3 | const config = require('../config.js') 4 | 5 | exports.debug = function (...args) { 6 | log(0, ...args) 7 | } 8 | 9 | exports.info = function (...args) { 10 | log(1, ...args) 11 | } 12 | 13 | exports.error = function (...args) { 14 | log(2, ...args) 15 | } 16 | 17 | if (cluster.isMaster) { 18 | cluster.on('fork', worker => { 19 | worker.on('message', msg => { 20 | if (msg.logMessage) { 21 | log(msg.level, msg.msg) 22 | } 23 | }) 24 | }) 25 | } 26 | 27 | function log (level, ...args) { 28 | if (level >= config.logLevel) { 29 | let msg = args.map(a => (typeof a == 'string') ? a : util.inspect(a)).join(' ') 30 | if (cluster.isMaster) { 31 | level >= 2 ? 
console.error(msg) : console.log(msg) 32 | } else { 33 | process.send({ logMessage: true, level: level, msg: msg }) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/utils/lxd.js: -------------------------------------------------------------------------------- 1 | const https = require('https') 2 | const WebSocket = require('ws') 3 | const axios = require('axios') 4 | const assign = require('assign-deep') 5 | const Parallel = require('async-parallel') 6 | 7 | const log = require('../utils/logger.js') 8 | const { to } = require('../utils/async.js') 9 | const config = require('../config.js') 10 | 11 | const lxdStatus = { 12 | created: 100, 13 | started: 101, 14 | stopped: 102, 15 | running: 103, 16 | canceling: 104, 17 | pending: 105, 18 | starting: 106, 19 | stopping: 107, 20 | aborting: 108, 21 | freezing: 109, 22 | frozen: 110, 23 | thawed: 111, 24 | success: 200, 25 | failure: 400, 26 | cancelled: 401 27 | } 28 | 29 | var exports = module.exports = {} 30 | 31 | var agent = new https.Agent({ 32 | key: config.clientKey, 33 | cert: config.clientCert, 34 | rejectUnauthorized: false 35 | }) 36 | 37 | function getUrl (endpoint, resource) { 38 | return endpoint + '/1.0' + (resource ? 
('/' + resource) : '') 39 | } 40 | 41 | async function wrapLxdResponse (endpoint, promise, options) { 42 | let response 43 | try { 44 | response = await promise 45 | } catch (ex) { 46 | log.debug('LXD error', ex.response && ex.response.data) 47 | throw ex 48 | } 49 | let data = response.data 50 | if (typeof data === 'string' || data instanceof String) { 51 | return data 52 | } else if (typeof data === 'object') { 53 | switch(data.type) { 54 | case 'sync': 55 | if (data.metadata) { 56 | if (data.metadata.err) { 57 | throw data.metadata.err 58 | } 59 | return data.metadata 60 | } else { 61 | return data 62 | } 63 | case 'async': 64 | if (options && options.openSocket) { 65 | log.debug('Opening socket:', data.operation + '/websocket') 66 | if (data.metadata && data.metadata.metadata && data.metadata.metadata.fds) { 67 | let wsEndpoint = endpoint.startsWith('http') ? ('ws' + endpoint.slice(4)) : endpoint 68 | let names = Object.keys(data.metadata.metadata.fds) 69 | let sockets = {} 70 | await Parallel.each(names, name => new Promise((resolve, reject) => { 71 | try { 72 | let wsc = new WebSocket( 73 | wsEndpoint + data.operation + '/websocket?secret=' + data.metadata.metadata.fds[name], 74 | null, 75 | { agent: agent } 76 | ) 77 | wsc.on('open', () => resolve(wsc)) 78 | wsc.on('error', reject) 79 | sockets[name] = wsc 80 | } catch (ex) { 81 | reject(ex) 82 | } 83 | })) 84 | return sockets 85 | } else { 86 | throw "Unable to open web-socket" 87 | } 88 | } else { 89 | log.debug('Forwarding:', data.operation + '/wait') 90 | return await wrapLxdResponse(endpoint, axios.get(endpoint + data.operation + '/wait', { httpsAgent: agent }), options) 91 | } 92 | case 'error': 93 | log.debug('LXD error', data.error) 94 | throw data.error 95 | } 96 | } 97 | } 98 | 99 | function callLxd(method, endpoint, resource, data, options) { 100 | let axiosConfig = assign({ 101 | method: method, 102 | url: getUrl(endpoint, resource), 103 | httpsAgent: agent, 104 | data: data, 105 | timeout: 
config.lxdTimeout 106 | }, options || {}) 107 | log.debug(method, axiosConfig.url, data || '') 108 | return wrapLxdResponse(endpoint, axios(axiosConfig), options) 109 | } 110 | 111 | exports.get = function (endpoint, resource, options) { 112 | return callLxd('get', endpoint, resource, undefined, options) 113 | } 114 | 115 | exports.delete = function (endpoint, resource, options) { 116 | return callLxd('delete', endpoint, resource, undefined, options) 117 | } 118 | 119 | exports.put = function (endpoint, resource, data, options) { 120 | return callLxd('put', endpoint, resource, data, options) 121 | } 122 | 123 | exports.post = function (endpoint, resource, data, options) { 124 | return callLxd('post', endpoint, resource, data, options) 125 | } 126 | -------------------------------------------------------------------------------- /src/utils/scripts.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | const stream = require('stream') 4 | const { spawn } = require('child_process') 5 | 6 | var exports = module.exports = {} 7 | 8 | function shellQuote (str) { 9 | str = '' + str 10 | str = str.replace(/\\/g, '\\\\') 11 | str = str.replace(/\'/g, '\\\'') 12 | str = str.replace(/(?:\r\n|\r|\n)/g, '\\n') 13 | str = '$\'' + str + '\'' 14 | return str 15 | } 16 | exports.shellQuote = shellQuote 17 | 18 | exports.envToScript = function (env, doExport) { 19 | let envScript = [] 20 | for (let name of Object.keys(env)) { 21 | envScript.push((doExport ? 
'export ' : '') + name + '=' + shellQuote(env[name]) + '\n') 22 | } 23 | return envScript.join('') 24 | } 25 | 26 | var _loadedScripts = {} 27 | 28 | const _includePrefix = '#INCLUDE ' 29 | 30 | function _getScript (scriptName, alreadyIncluded) { 31 | if (alreadyIncluded.hasOwnProperty(scriptName)) { 32 | return '' 33 | } 34 | if (_loadedScripts.hasOwnProperty(scriptName)) { 35 | return _loadedScripts[scriptName] 36 | } 37 | let scriptPath = path.join(__dirname, '..', '..', 'scripts', scriptName) 38 | let script = fs.readFileSync(scriptPath).toString() 39 | alreadyIncluded[scriptName] = true 40 | script = script 41 | .split('\n') 42 | .map( 43 | l => l.startsWith(_includePrefix) ? 44 | _getScript(l.substring(_includePrefix.length), alreadyIncluded) : 45 | l 46 | ) 47 | .join('\n') 48 | return _loadedScripts[scriptName] = script 49 | } 50 | 51 | exports.getScript = function(scriptName) { 52 | return _getScript(scriptName, {}) 53 | } 54 | 55 | exports.runScript = function(scriptName, env, callback) { 56 | if (typeof env == 'function') { 57 | callback = env 58 | env = {} 59 | } 60 | env = env || {} 61 | let script = _getScript(scriptName, {}) 62 | //console.log('Running script "' + scriptPath + '"') 63 | p = spawn('bash', ['-s']) 64 | let stdout = [] 65 | p.stdout.on('data', data => stdout.push(data)) 66 | let stderr = [] 67 | p.stderr.on('data', data => stderr.push(data)) 68 | let called = false 69 | let callCallback = code => { 70 | if (!called) { 71 | called = true 72 | callback(code, stdout.join('\n'), stderr.join('\n')) 73 | } 74 | } 75 | p.on('close', code => callCallback(code)) 76 | p.on('error', err => callCallback(128)) 77 | p.on('exit', code => callCallback(code || 0)) 78 | var stdinStream = new stream.Readable() 79 | Object.keys(env).forEach(name => stdinStream.push( 80 | 'export ' + name + '=' + exports.shellQuote(env[name]) + '\n') 81 | ) 82 | stdinStream.push(script + '\n') 83 | stdinStream.push(null) 84 | stdinStream.pipe(p.stdin) 85 | return p 86 | } 

// -------------------- /src/utils/simplefs.js --------------------

const fs = require('fs-extra')
const path = require('path')
const parseRange = require('range-parser')
const { to } = require('./async.js')
const log = require('./logger.js')

var exports = module.exports = {}

exports.pattern = ':aspect/(*)?'

// Serves a simple HTTP filesystem API rooted at basePath.
//   GET    :aspect=stats   -> entry metadata
//   GET    :aspect=content -> directory listing or (range-capable) file body
//   PUT    :aspect=stats   -> create/truncate file or create directory
//   PUT    :aspect=content -> write request body at Content-Offset
//   DELETE :aspect=stats   -> remove file or directory
// Symbolic links are refused (403); writes require readOnly == false.
async function performCommand (basePath, req, res, readOnly) {
    let targetPath = path.resolve(basePath, req.params[0] || '')
    let aspect = req.params.aspect
    // BUGFIX: a plain prefix check also accepted sibling directories
    // ("/base" matched "/basement"); require the base itself or a
    // path-separator boundary to prevent traversal.
    if (targetPath !== basePath && !targetPath.startsWith(basePath + path.sep)) {
        return res.status(404).send()
    }
    let [statsErr, stats] = await to(fs.lstat(targetPath))
    if (!statsErr && stats.isSymbolicLink()) {
        return res.status(403).send()
    }
    if (req.method === "GET") {
        if (statsErr || !(stats.isDirectory() || stats.isFile())) {
            res.status(404).send()
        } else if (aspect === 'stats') {
            res.send({
                isFile: stats.isFile(),
                size: stats.size,
                mtime: stats.mtime,
                atime: stats.atime,
                ctime: stats.ctime
            })
        } else if (aspect === 'content') {
            if (stats.isDirectory()) {
                let dirs = []
                let files = []
                let names = await fs.readdir(targetPath)
                const promises = names.map(async entry => {
                    let ePath = path.join(targetPath, entry)
                    let eStat = await fs.lstat(ePath)
                    return {
                        name: entry,
                        isFile: eStat.isFile(),
                        isDirectory: eStat.isDirectory()
                    }
                })
                let dirents = await Promise.all(promises)
                for (let dirent of dirents) {
                    if (dirent.isFile) {
                        files.push(dirent.name)
                    } else if (dirent.isDirectory) {
                        dirs.push(dirent.name)
                    }
                }
                res.send({dirs: dirs, files: files})
            } else {
                if (req.headers.range) {
                    let ranges = parseRange(stats.size, req.headers.range)
                    if (Array.isArray(ranges)) {
                        // Exactly one byte range is supported.
                        if (ranges.type !== 'bytes' || ranges.length !== 1) {
                            res.status(416).send()
                        } else {
                            let range = ranges[0]
                            res.writeHead(206, {
                                'Content-Type': 'application/octet-stream',
                                'Content-Range': 'bytes ' + range.start + '-' + range.end + '/' + stats.size,
                                'Content-Length': '' + (range.end - range.start + 1)
                            })
                            fs.createReadStream(
                                targetPath,
                                {start: range.start, end: range.end}
                            ).pipe(res)
                        }
                    } else if (ranges === -1) {
                        res.status(416).send()   // unsatisfiable range
                    } else {
                        res.status(400).send()   // malformed range header
                    }
                } else {
                    res.writeHead(200, {
                        'Content-Type': 'application/octet-stream',
                        'Content-Length': stats.size
                    })
                    fs.createReadStream(targetPath).pipe(res)
                }
            }
        } else {
            res.status(400).send()
        }
    } else if (req.method === "PUT" && !readOnly) {
        if (aspect === 'stats' && req.body && req.body.type) {
            let newSize = Number(req.body.size) || 0
            if (statsErr) {
                if (req.body.type === 'file') {
                    let file
                    try {
                        file = await fs.open(targetPath, 'w')
                        await fs.ftruncate(file, newSize)
                        res.send()
                    } catch (err) {
                        if (err.code === 'ENOENT') {
                            // Parent directory missing.
                            res.status(404).send()
                        } else {
                            // BUGFIX: the response used to be sent from a
                            // finally block even on rethrow, causing a
                            // second send in the error middleware.
                            throw err
                        }
                    } finally {
                        if (file) {
                            await fs.close(file)
                        }
                    }
                } else if (req.body.type === 'directory') {
                    await fs.ensureDir(targetPath)
                    res.send()
                } else {
                    res.status(400).send()
                }
            } else {
                if (req.body.type === 'file' && stats.isFile()) {
                    await fs.truncate(targetPath, newSize)
                    res.send()
                } else {
                    res.status(400).send()
                }
            }
        } else if (aspect === 'content' && !statsErr && stats.isFile()) {
            let offset = Number(req.headers['content-offset']) || 0
            await fs.truncate(targetPath, offset)
            let target = fs.createWriteStream(targetPath, {flags: 'a'})
            // BUGFIX: resolve on the write stream's 'finish' instead of
            // the request's 'end' - data could still be unflushed when
            // the response was sent.
            await new Promise((resolve, reject) => {
                req.pipe(target)
                target.on('finish', resolve)
                target.on('error', reject)
                req.on('error', reject)
            })
            res.send()
        } else {
            res.status(400).send()
        }
    } else if (req.method === "DELETE" && !readOnly) {
        if (aspect === 'stats' && !statsErr && (stats.isDirectory() || stats.isFile())) {
            await fs.remove(targetPath)
            res.send()
        } else {
            res.status(404).send()
        }
    } else {
        res.status(403).send()
    }
}
exports.performCommand = performCommand