├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── mergeable.yml └── workflows │ ├── executor-tests.yml │ └── unit-tests.yml ├── .gitignore ├── CONTRIBUTING.md ├── Development.md ├── LICENSE ├── README.md ├── Vagrantfile ├── cla ├── Project Cook_Corporate_Contributor_License_Agreement.docx └── Project Cook_Individual_Contributor_License_Agreement.docx ├── cli ├── .cs.json ├── .gitignore ├── README.md ├── cook │ ├── __init__.py │ ├── __main__.py │ ├── cli.py │ ├── configuration.py │ ├── dateparser.py │ ├── exceptions.py │ ├── format.py │ ├── http.py │ ├── mesos.py │ ├── metrics.py │ ├── plugins.py │ ├── progress.py │ ├── querying.py │ ├── subcommands │ │ ├── __init__.py │ │ ├── admin.py │ │ ├── cat.py │ │ ├── config.py │ │ ├── jobs.py │ │ ├── kill.py │ │ ├── ls.py │ │ ├── show.py │ │ ├── ssh.py │ │ ├── submit.py │ │ ├── tail.py │ │ ├── usage.py │ │ └── wait.py │ ├── terminal.py │ ├── util.py │ └── version.py ├── pytest.ini ├── setup.py ├── tests │ └── subcommands │ │ ├── __init__.py │ │ ├── test_dateparser.py │ │ └── test_querying.py └── travis │ └── setup.sh ├── cook.svg ├── dask └── docs │ └── design.md ├── executor ├── .dockerignore ├── .gitignore ├── Dockerfile.build ├── README.md ├── RELEASING.md ├── bin │ ├── build-docker.sh │ ├── build-local.sh │ ├── check-version.sh │ └── prepare-executor.sh ├── cook │ ├── __init__.py │ ├── __main__.py │ ├── _version.py │ ├── config.py │ ├── executor.py │ ├── io_helper.py │ ├── progress.py │ ├── subprocess.py │ └── util.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_config.py │ ├── test_executor.py │ ├── test_progress.py │ ├── test_subprocess.py │ └── utils.py └── travis │ ├── run_tests.sh │ └── setup.sh ├── integration ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── bin │ ├── build-docker-image.sh │ ├── only-run │ └── run-integration.sh ├── requirements.txt ├── setup.cfg ├── tests │ ├── 
__init__.py │ ├── conftest.py │ └── cook │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── conftest.py │ │ ├── mesos.py │ │ ├── reasons.py │ │ ├── test_basic.py │ │ ├── test_cli.py │ │ ├── test_cli_multi_cluster.py │ │ ├── test_cli_subcommand_plugin.py │ │ ├── test_client.py │ │ ├── test_dynamic_clusters.py │ │ ├── test_impersonation.py │ │ ├── test_master_slave.py │ │ ├── test_multi_cluster.py │ │ ├── test_multi_user.py │ │ └── util.py └── travis │ ├── prepare_integration.sh │ ├── run_integration.sh │ └── scheduler_travis_config.edn ├── jobclient ├── README.md ├── java │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── twosigma │ │ │ └── cook │ │ │ └── jobclient │ │ │ ├── Application.java │ │ │ ├── Checkpoint.java │ │ │ ├── Disk.java │ │ │ ├── Executor.java │ │ │ ├── FetchableURI.java │ │ │ ├── Group.java │ │ │ ├── GroupListener.java │ │ │ ├── HostPlacement.java │ │ │ ├── Instance.java │ │ │ ├── InstanceDecorator.java │ │ │ ├── Job.java │ │ │ ├── JobClient.java │ │ │ ├── JobClientException.java │ │ │ ├── JobClientInterface.java │ │ │ ├── JobListener.java │ │ │ ├── StragglerHandling.java │ │ │ ├── auth │ │ │ └── spnego │ │ │ │ ├── BasicSPNegoSchemeFactory.java │ │ │ │ └── GSSCredentialProvider.java │ │ │ └── constraint │ │ │ ├── Constraint.java │ │ │ ├── Constraints.java │ │ │ ├── OneToOneConstraint.java │ │ │ └── Operator.java │ │ └── test │ │ └── java │ │ └── com │ │ └── twosigma │ │ ├── ConstraintTest.java │ │ └── cook │ │ └── jobclient │ │ ├── FetchableURITest.java │ │ ├── GroupTest.java │ │ ├── InstanceTest.java │ │ ├── JobClientTest.java │ │ └── JobTest.java └── python │ ├── README.md │ ├── cookclient │ ├── __init__.py │ ├── containers.py │ ├── instance.py │ ├── jobs.py │ └── util.py │ ├── docs │ ├── Makefile │ ├── make.bat │ └── source │ │ ├── api.rst │ │ ├── conf.py │ │ ├── index.rst │ │ └── usage.rst │ ├── requirements.txt │ ├── setup.py │ └── tests │ ├── test_instance.py │ └── test_job.py ├── scheduler ├── .dockerignore ├── .gitignore ├── 
CHANGELOG.md ├── Dockerfile ├── README-k8s.md ├── README.adoc ├── api-only-config.edn ├── bin │ ├── bootstrap │ ├── build-docker-image.sh │ ├── help-delete-temporary-clusters │ ├── help-make-cluster │ ├── make-gke-test-cluster │ ├── make-gke-test-clusters │ ├── priority-class-cook-workload.yaml │ ├── priority-class-synthetic-pod.yaml │ ├── run-docker.sh │ ├── run-local-kubernetes.sh │ ├── run-local.sh │ ├── sample_launch.sh │ ├── start-datomic.sh │ └── submit-docker.sh ├── config-composite.edn ├── config-k8s.edn ├── config.edn ├── datomic │ ├── data │ │ ├── seed_k8s_pools.clj │ │ └── seed_running_jobs.clj │ ├── datomic-free-0.9.5561.56.zip │ └── datomic_transactor.properties ├── dev-config.edn ├── docker │ └── run-cook.sh ├── docs │ ├── clj-http-async-pool.md │ ├── concepts.md │ ├── configuration.adoc │ ├── dev-getting-started.md │ ├── faq.md │ ├── groups.md │ ├── kubernetes-state.dot │ ├── make-kubernetes-namespace.json │ ├── metatransactions.md │ ├── optimizer.md │ ├── reason-code │ ├── rebalancer-config.adoc │ ├── scheduler-rest-api.adoc │ └── simulator.md ├── example-prod-config.edn ├── java │ └── com │ │ ├── netflix │ │ └── fenzo │ │ │ └── SimpleAssignmentResult.java │ │ └── twosigma │ │ └── cook │ │ └── kubernetes │ │ ├── FinalizerHelper.java │ │ ├── ParallelWatchQueue.java │ │ ├── TokenRefreshingAuthenticator.java │ │ └── WatchHelper.java ├── liquibase │ ├── README.md │ └── changelog │ │ └── com │ │ └── twosigma │ │ └── cook │ │ └── changelogs │ │ └── setup.postgresql.sql ├── postgresql │ ├── README.md │ ├── bin │ │ ├── make-launch-postgres-docker.sh │ │ ├── setup-database.sh │ │ ├── setup-new-schema.sh │ │ ├── vagrant-setup-database.sh │ │ └── vagrant-setup-new-schema.sh │ └── sql │ │ ├── docker_init_new_database.sql │ │ ├── init_cook_database.sql │ │ ├── insert_rows_for_opensource_integration_tests.sql │ │ ├── reset_cook_database.sql │ │ └── reset_init_cook_database.sql ├── project.clj ├── simulator_files │ ├── analysis │ │ ├── .gitignore │ │ ├── README.md 
│ │ ├── analysis.ipynb │ │ ├── analysis │ │ │ └── __init__.py │ │ ├── requirements_dev.txt │ │ ├── setup.cfg │ │ ├── setup.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── analysis │ │ │ ├── __init__.py │ │ │ └── test_basic.py │ ├── example-config.edn │ ├── example-hosts.json │ ├── example-out-trace.csv │ └── example-trace.json ├── src │ ├── cook │ │ ├── cache.clj │ │ ├── cached_queries.clj │ │ ├── caches.clj │ │ ├── components.clj │ │ ├── compute_cluster.clj │ │ ├── compute_cluster │ │ │ └── metrics.clj │ │ ├── config.clj │ │ ├── config_incremental.clj │ │ ├── curator.clj │ │ ├── datomic.clj │ │ ├── group.clj │ │ ├── kubernetes │ │ │ ├── api.clj │ │ │ ├── compute_cluster.clj │ │ │ ├── controller.clj │ │ │ └── metrics.clj │ │ ├── log_structured.clj │ │ ├── mesos.clj │ │ ├── mesos │ │ │ ├── heartbeat.clj │ │ │ ├── mesos_compute_cluster.clj │ │ │ ├── mesos_mock.clj │ │ │ ├── reason.clj │ │ │ ├── sandbox.clj │ │ │ └── task.clj │ │ ├── monitor.clj │ │ ├── passport.clj │ │ ├── plugins │ │ │ ├── adjustment.clj │ │ │ ├── completion.clj │ │ │ ├── definitions.clj │ │ │ ├── demo_plugin.clj │ │ │ ├── file.clj │ │ │ ├── job_submission_modifier.clj │ │ │ ├── launch.clj │ │ │ ├── pool.clj │ │ │ ├── pool_mover.clj │ │ │ ├── submission.clj │ │ │ └── util.clj │ │ ├── pool.clj │ │ ├── postgres.clj │ │ ├── progress.clj │ │ ├── prometheus_metrics.clj │ │ ├── queries.clj │ │ ├── queue_limit.clj │ │ ├── quota.clj │ │ ├── rate_limit.clj │ │ ├── rate_limit │ │ │ ├── generic.clj │ │ │ └── token_bucket_filter.clj │ │ ├── rebalancer.clj │ │ ├── regexp_tools.clj │ │ ├── reporter.clj │ │ ├── rest │ │ │ ├── api.clj │ │ │ ├── authorization.clj │ │ │ ├── basic_auth.clj │ │ │ ├── cors.clj │ │ │ ├── impersonation.clj │ │ │ ├── secret.clj │ │ │ └── spnego.clj │ │ ├── scheduler │ │ │ ├── constraints.clj │ │ │ ├── dru.clj │ │ │ ├── fenzo_utils.clj │ │ │ ├── offer.clj │ │ │ ├── optimizer.clj │ │ │ ├── scheduler.clj │ │ │ └── share.clj │ │ ├── schema.clj │ │ ├── scratch.clj │ │ ├── task.clj │ │ ├── 
task_stats.clj │ │ ├── test │ │ │ ├── postgres.clj │ │ │ └── testutil.clj │ │ ├── tools.clj │ │ ├── unscheduled.clj │ │ └── util.clj │ ├── fork │ │ └── metrics_clojure │ │ │ ├── LICENSE.markdown │ │ │ ├── README.txt │ │ │ └── metrics │ │ │ └── jvm │ │ │ └── core.clj │ └── metatransaction │ │ ├── core.clj │ │ └── utils.clj ├── test-resources │ └── log4j.properties ├── test │ ├── cook │ │ └── test │ │ │ ├── benchmark.clj │ │ │ ├── cache.clj │ │ │ ├── components.clj │ │ │ ├── compute_cluster.clj │ │ │ ├── config.clj │ │ │ ├── config_incremental.clj │ │ │ ├── group.clj │ │ │ ├── jobclient │ │ │ └── jobclient.clj │ │ │ ├── kubernetes │ │ │ ├── api.clj │ │ │ ├── compute_cluster.clj │ │ │ └── controller.clj │ │ │ ├── log_structured.clj │ │ │ ├── mesos.clj │ │ │ ├── mesos │ │ │ ├── heartbeat.clj │ │ │ ├── mesos_compute_cluster.clj │ │ │ ├── mesos_mock.clj │ │ │ ├── reason.clj │ │ │ ├── sandbox.clj │ │ │ └── task.clj │ │ │ ├── monitor.clj │ │ │ ├── plugins.clj │ │ │ ├── plugins │ │ │ ├── job_submission_modifier.clj │ │ │ ├── pool.clj │ │ │ └── submission.clj │ │ │ ├── pool.clj │ │ │ ├── progress.clj │ │ │ ├── queue_limit.clj │ │ │ ├── quota.clj │ │ │ ├── rate_limit │ │ │ ├── generic.clj │ │ │ └── token_bucket_filter.clj │ │ │ ├── rebalancer.clj │ │ │ ├── regexp_tools.clj │ │ │ ├── rest │ │ │ ├── api.clj │ │ │ ├── authorization.clj │ │ │ ├── basic_auth.clj │ │ │ ├── cors.clj │ │ │ └── impersonation.clj │ │ │ ├── scheduler │ │ │ ├── constraints.clj │ │ │ ├── dru.clj │ │ │ ├── fenzo_utils.clj │ │ │ ├── optimizer.clj │ │ │ ├── scheduler.clj │ │ │ └── share.clj │ │ │ ├── schema.clj │ │ │ ├── task.clj │ │ │ ├── tools.clj │ │ │ ├── unscheduled.clj │ │ │ ├── util.clj │ │ │ └── zz_simulator.clj │ └── metatransaction │ │ ├── core_test.clj │ │ └── utils_test.clj └── travis │ └── setup.sh ├── sidecar ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── cook │ ├── __init__.py │ └── sidecar │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── config.py │ │ ├── exit_sentinel.py │ 
│ ├── file_server.py │ │ ├── progress.py │ │ ├── tracker.py │ │ ├── util.py │ │ └── version.py └── setup.py ├── simulator ├── README.md ├── config │ ├── larger_cluster_simulation.edn │ └── settings.edn ├── doc │ └── development.md ├── project.clj ├── resources │ └── job_schedule.edn ├── src │ ├── dev │ │ └── cook │ │ │ └── sim │ │ │ └── repl.clj │ └── main │ │ └── cook │ │ └── sim │ │ ├── cli.clj │ │ ├── database.clj │ │ ├── reporting.clj │ │ ├── reporting │ │ └── groups.clj │ │ ├── runner.clj │ │ ├── schedule.clj │ │ ├── system.clj │ │ ├── travis.clj │ │ └── util.clj └── travis │ ├── prepare_simulation.sh │ ├── run_simulation.sh │ ├── scheduler_config.edn │ └── simulator_config.edn ├── spark ├── 0001-Add-cook-support-for-spark-v1.5.0.patch ├── 0001-Add-cook-support-for-spark-v1.6.1.patch └── README.md └── travis ├── build_cook_executor.sh ├── gdrive_upload ├── install_mesos.sh ├── minimesos ├── minimesosFile ├── prepare.sh ├── show_executor_logs.sh ├── show_scheduler_logs.sh ├── start_scheduler.sh └── upload_logs.sh /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain your problem. 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 
25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Changes proposed in this PR 2 | 3 | - 4 | - 5 | - 6 | 7 | ## Why are we making these changes? 
8 | 9 | 10 | -------------------------------------------------------------------------------- /.github/mergeable.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | mergeable: 3 | - when: pull_request.* 4 | validate: 5 | - do: label 6 | must_include: 7 | regex: '^internal-green$' 8 | must_exclude: 9 | regex: '^wip$' 10 | -------------------------------------------------------------------------------- /.github/workflows/executor-tests.yml: -------------------------------------------------------------------------------- 1 | name: Cook Executor tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | push: 8 | branches: 9 | - master 10 | - 'build**' 11 | - kubernetes_integration 12 | schedule: 13 | - cron: '0 0 * * *' 14 | 15 | jobs: 16 | test: 17 | runs-on: ubuntu-20.04 18 | env: 19 | PYTEST_ADDOPTS: --color=yes 20 | MESOS_NATIVE_JAVA_LIBRARY: /usr/lib/libmesos.so 21 | CLJ_HTTP_ASYNC_POOL_TEST_DURATION_MULTIPLIER: 5 22 | GDRIVE_LOG_POST_URL: https://script.google.com/macros/s/AKfycbxOB55OzrQSbpZO_0gzsxZaJ8LaUWWo3PDLNc-gCiMN1iObxu7x/exec 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up JDK 11 26 | uses: actions/setup-java@v1 27 | with: 28 | java-version: '11' 29 | - name: Cache Maven packages 30 | uses: actions/cache@v2 31 | with: 32 | path: ~/.m2 33 | key: ${{ runner.os }}-m2-${{ hashFiles('**/project.clj') }} 34 | restore-keys: ${{ runner.os }}-m2 35 | - name: Set up Python 36 | uses: actions/setup-python@v2 37 | with: 38 | python-version: '3.6.x' 39 | - name: Cache pip 40 | uses: actions/cache@v2 41 | with: 42 | path: ~/.cache/pip 43 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} 44 | restore-keys: | 45 | ${{ runner.os }}-pip- 46 | - name: Setup tests 47 | run: cd executor && ./travis/setup.sh && env 48 | - name: Run tests 49 | run: env && cd executor && ./travis/run_tests.sh 50 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.iml 3 | *.jar 4 | *.swo 5 | *.swp 6 | *~ 7 | *.pyc 8 | .DS_Store 9 | .idea/ 10 | .vscode/ 11 | .kill_lein 12 | .lein-cljsbuild-compiler* 13 | .lein-deps-sum 14 | .lein-failures 15 | .lein-plugins 16 | .lein-repl-history 17 | .minimesos/ 18 | .nrepl-port 19 | .vim_jack_in 20 | /checkouts 21 | /classes 22 | /lib 23 | /log 24 | /target 25 | __pycache__/ 26 | gclog* 27 | jobclient/src/cfg 28 | pom.xml 29 | pom.xml.asc 30 | scheduler/checkouts 31 | scheduler/log/ 32 | scheduler/resources/public/cook-executor* 33 | scheduler/src/cfg 34 | scheduler/trace*.csv 35 | scheduler/.cook_kubeconfig_* 36 | scheduler/cook.p12 37 | scheduler/datomic/datomic-free-0.9.5561.56/ 38 | src/cfg/current.clj 39 | target 40 | test-log 41 | *.orig 42 | venv 43 | dist 44 | .vagrant/ 45 | jobclient/python/docs/build/ 46 | *.egg-info/ 47 | .clj-kondo/ 48 | .lsp/ 49 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | In order to accept your code contributions, please fill out the appropriate Contributor License Agreement in the `cla` folder and submit it to tsos@twosigma.com. 4 | 5 | In your pull request, add a line in the [changelog](CHANGELOG.md) under "unreleased" describing your change. 
6 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | config.vm.box = "hashicorp/bionic64" 3 | config.vm.network "forwarded_port", guest: 5432, host: 5432 4 | config.vm.network "forwarded_port", guest: 12321, host: 12321 5 | config.vm.provider "virtualbox" do |v| 6 | v.memory = 6144 7 | v.cpus = 2 8 | end 9 | 10 | # This runs as root: 11 | config.vm.provision "bootstrap_as_root", type: "shell", path: "scheduler/bin/bootstrap", env: { 12 | "PGPASSWORD" => ENV["PGPASSWORD"], 13 | "GKE_CLUSTER_OWNER" => ENV["USER"], 14 | "GCP_PROJECT_NAME" => ENV["GCP_PROJECT_NAME"]} 15 | 16 | # This runs as vagrant: 17 | $script = <<-SCRIPT 18 | repo_root=/vagrant 19 | bashrc=$HOME/.bashrc 20 | 21 | # Cook java jobclient setup 22 | cd $repo_root/jobclient/java || exit 1 23 | mvn install -DskipTests 24 | 25 | # Python setup 26 | pip3 install --upgrade pip 27 | pip3 install --upgrade setuptools 28 | pip3 install --upgrade wheel 29 | pip3 install --upgrade virtualenv 30 | cd $repo_root || exit 1 31 | venv=$repo_root/venv 32 | rm -rf $venv 33 | $HOME/.local/bin/virtualenv venv --python=python3.6 34 | source $venv/bin/activate 35 | echo "source $venv/bin/activate" | tee -a $bashrc 36 | export PATH=$venv/bin:$PATH 37 | echo 'export PATH='$venv'/bin:$PATH' | tee -a $bashrc 38 | 39 | # Integration tests setup 40 | echo "export COOK_TEST_DOCKER_IMAGE=gcr.io/google-containers/alpine-with-bash:1.0" | tee -a $bashrc 41 | echo "export COOK_TEST_DOCKER_WORKING_DIRECTORY=/mnt/sandbox" | tee -a $bashrc 42 | echo "export COOK_TEST_DISALLOW_POOLS_REGEX='(?!^k8s-(alpha)$)'" | tee -a $bashrc 43 | echo "export COOK_TEST_DEFAULT_SUBMIT_POOL=k8s-alpha" | tee -a $bashrc 44 | echo "export COOK_TEST_COMPUTE_CLUSTER_TYPE=kubernetes" | tee -a $bashrc 45 | echo "export COOK_TEST_DEFAULT_TIMEOUT_MS=480000" | tee -a $bashrc 46 | echo "export 
COOK_TEST_DEFAULT_WAIT_INTERVAL_MS=8000" | tee -a $bashrc 47 | cd $repo_root/integration || exit 1 48 | pip3 install -r requirements.txt 49 | 50 | # Cook Scheduler CLI setup 51 | cli=$repo_root/cli 52 | cd $cli || exit 1 53 | pip3 install -e . 54 | rm -f $HOME/.cs.json 55 | ln -s $cli/.cs.json $HOME/.cs.json 56 | 57 | sudo service postgresql restart 58 | SCRIPT 59 | config.vm.provision "bootstrap_as_vagrant", type: "shell", inline: $script, privileged: false 60 | end 61 | -------------------------------------------------------------------------------- /cla/Project Cook_Corporate_Contributor_License_Agreement.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/cla/Project Cook_Corporate_Contributor_License_Agreement.docx -------------------------------------------------------------------------------- /cla/Project Cook_Individual_Contributor_License_Agreement.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/cla/Project Cook_Individual_Contributor_License_Agreement.docx -------------------------------------------------------------------------------- /cli/.cs.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaults": { 3 | "submit": { 4 | "mem": 128, 5 | "cpus": 1, 6 | "max-retries": 1, 7 | "cluster": "dev1" 8 | } 9 | }, 10 | "clusters": [ 11 | { 12 | "name": "dev0", 13 | "url": "http://127.0.0.1:12321/", 14 | "disabled": false 15 | }, 16 | { 17 | "name": "dev1", 18 | "url": "http://127.0.0.1:22321/", 19 | "disabled": true 20 | } 21 | ], 22 | "metrics": { 23 | "disabled": true, 24 | "host": "localhost", 25 | "port": 8125, 26 | "line-formats": { 27 | "count": "{namespace}.{name}:{value}|c" 28 | } 29 | } 30 | } 
-------------------------------------------------------------------------------- /cli/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | *.spec 4 | build 5 | dist 6 | venv 7 | Pipfile 8 | virtualenv* 9 | -------------------------------------------------------------------------------- /cli/cook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/cli/cook/__init__.py -------------------------------------------------------------------------------- /cli/cook/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Module implementing a CLI for the Cook scheduler API. """ 3 | 4 | import logging 5 | import signal 6 | import sys 7 | 8 | from cook import util 9 | from cook.cli import run 10 | from cook.util import print_error 11 | 12 | 13 | def main(args=None, plugins={}): 14 | if args is None: 15 | args = sys.argv[1:] 16 | 17 | try: 18 | result = run(args, plugins) 19 | sys.exit(result) 20 | except Exception as e: 21 | logging.exception('exception when running with %s' % args) 22 | print_error(str(e)) 23 | sys.exit(1) 24 | 25 | 26 | def sigint_handler(_, __): 27 | """ 28 | Sets util.quit_running to True (which is read by other 29 | threads to determine when to stop), and then exits. 30 | """ 31 | util.quit_running = True 32 | print('Exiting...') 33 | sys.exit(0) 34 | 35 | 36 | signal.signal(signal.SIGINT, sigint_handler) 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /cli/cook/dateparser.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | PATTERN_TO_TIMEDELTA_FN = ( 5 | (r'^(\d+) sec(?:ond)?s? 
ago$', lambda x: datetime.timedelta(seconds=x)), 6 | (r'^(\d+) min(?:ute)?s? ago$', lambda x: datetime.timedelta(minutes=x)), 7 | (r'^(\d+) hours? ago$', lambda x: datetime.timedelta(hours=x)), 8 | (r'^(\d+) days? ago$', lambda x: datetime.timedelta(days=x)), 9 | (r'^(\d+) weeks? ago$', lambda x: datetime.timedelta(weeks=x)) 10 | ) 11 | 12 | 13 | def parse(date_time_string, time_zone): 14 | """ 15 | Parses the given date_time_string and constructs a datetime object. 16 | Accepts strings in the following formats, where x is any integer: 17 | 18 | - now 19 | - today 20 | - yesterday 21 | - x seconds ago 22 | - x minutes ago 23 | - x hours ago 24 | - x days ago 25 | - x weeks ago 26 | - any format supported by dateutil's parser 27 | 28 | Why did we roll our own datetime parsing function? 29 | The existing libraries that do this sort of parsing also provide 30 | additional features such as multi-language support which: 31 | 32 | - add complexity we don't want 33 | - slow them down 34 | - make pyinstaller compatibility hard or impossible 35 | """ 36 | date_time_string = date_time_string.strip() 37 | date_time_string_lower = date_time_string.lower() 38 | now = datetime.datetime.now(tz=time_zone) 39 | 40 | if date_time_string_lower in ('now', 'today'): 41 | return now 42 | 43 | if date_time_string_lower == 'yesterday': 44 | return now - datetime.timedelta(days=1) 45 | 46 | import re 47 | for pattern, timedelta_fn in PATTERN_TO_TIMEDELTA_FN: 48 | match = re.match(pattern, date_time_string_lower) 49 | if match: 50 | return now - timedelta_fn(int(match.groups()[0])) 51 | 52 | try: 53 | from dateutil import parser 54 | dt = parser.parse(date_time_string, ignoretz=False) 55 | if dt: 56 | if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: 57 | dt = time_zone.localize(dt) 58 | return dt 59 | except ValueError as ve: 60 | logging.exception(ve) 61 | 62 | return None 63 | -------------------------------------------------------------------------------- /cli/cook/exceptions.py: 
-------------------------------------------------------------------------------- 1 | class CookRetriableException(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /cli/cook/format.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import humanfriendly 4 | 5 | from cook import terminal 6 | from cook.util import millis_to_timedelta, millis_to_date_string 7 | 8 | 9 | def format_dict(d): 10 | """Formats the given dictionary for display in a table""" 11 | return ' '.join(['%s=%s' % (k, v) for k, v in sorted(d.items())]) if len(d) > 0 else '(empty)' 12 | 13 | 14 | def format_list(l): 15 | """Formats the given list for display in a table""" 16 | return '; '.join([format_dict(x) if isinstance(x, dict) else str(x) for x in l]) if len(l) > 0 else '(empty)' 17 | 18 | 19 | def format_state(state): 20 | """Capitalizes and colorizes the given state""" 21 | state = state.capitalize() 22 | if state == 'Running': 23 | text = terminal.running(state) 24 | elif state == 'Waiting': 25 | text = terminal.waiting(state) 26 | elif state == 'Failed': 27 | text = terminal.failed(state) 28 | elif state == 'Success': 29 | text = terminal.success(state) 30 | else: 31 | text = state 32 | return text 33 | 34 | 35 | def format_instance_status(instance): 36 | """Formats the instance status field""" 37 | status_text = format_state(instance['status']) 38 | 39 | if 'reason_string' in instance: 40 | reason_text = f' ({terminal.reason(instance["reason_string"])})' 41 | else: 42 | reason_text = '' 43 | 44 | if 'progress' in instance and instance['progress'] > 0: 45 | if 'progress_message' in instance: 46 | progress_text = f' ({instance["progress"]}% {terminal.bold(instance["progress_message"])})' 47 | else: 48 | progress_text = f' ({instance["progress"]}%)' 49 | else: 50 | progress_text = '' 51 | 52 | return f'{status_text}{reason_text}{progress_text}' 53 | 54 | 55 | def 
format_instance_run_time(instance): 56 | """Formats the instance run time field""" 57 | if 'end_time' in instance: 58 | end = instance['end_time'] 59 | else: 60 | end = int(round(time.time() * 1000)) 61 | run_time = millis_to_timedelta(end - instance['start_time']) 62 | return '%s (started %s)' % (run_time, millis_to_date_string(instance['start_time'])) 63 | 64 | 65 | def format_job_status(job): 66 | """Formats the job status field""" 67 | return format_state(job['state']) 68 | 69 | 70 | def format_memory_amount(mebibytes): 71 | """Formats an amount, in MiB, to be human-readable""" 72 | return humanfriendly.format_size(mebibytes * 1024 * 1024, binary=True) 73 | 74 | 75 | def format_job_memory(job): 76 | """Formats the job memory field""" 77 | return format_memory_amount(job['mem']) 78 | 79 | 80 | def format_job_attempts(job): 81 | """Formats the job attempts field (e.g. 2 / 5)""" 82 | return '%s / %s' % (job['max_retries'] - job['retries_remaining'], job['max_retries']) 83 | -------------------------------------------------------------------------------- /cli/cook/plugins.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | __plugins = {} 4 | 5 | class SubCommandPlugin: 6 | """Base class to implement custom subcommands.""" 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def register(self, add_parser, add_defaults): 12 | """Register this subcommand with argparse. 13 | 14 | Must be implemented by the subclass extending SubCommandPlugin. 15 | """ 16 | raise NotImplementedError 17 | 18 | def run(self, clusters, args, config_path): 19 | """Run the subcommand. 20 | 21 | Must be implemented by the subclass extending SubCommandPlugin. 22 | """ 23 | raise NotImplementedError 24 | 25 | def name(): 26 | """Return the shortname of the subcommand. 27 | 28 | This shortname is used to register this subcommand in the list 29 | of supported actions. 
It cannot clash with an existing core 30 | subcommand or other plugin based subcommands. 31 | 32 | Must be implemented by the subclass extended SubCommandPlugin. 33 | """ 34 | raise NotImplementedError 35 | 36 | def configure(plugins): 37 | """Configures global plugins to the plugins map""" 38 | global __plugins 39 | __plugins = plugins 40 | logging.debug('plugins: %s', __plugins) 41 | 42 | 43 | def get_fn(plugin_name, default_fn): 44 | """Returns the plugin function corresponding to the given plugin name if found, otherwise, default_fn""" 45 | return __plugins.get(plugin_name, default_fn) 46 | -------------------------------------------------------------------------------- /cli/cook/progress.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | from cook import terminal 4 | from cook.util import print_info 5 | 6 | data = [] 7 | lock = threading.Lock() 8 | 9 | 10 | def __print_state(lines_to_move_up): 11 | """ 12 | "Refreshes" the state on the terminal by moving the cursor up 13 | lines_to_move_up lines and then printing the current state of the data 14 | list, which contains [item, status] pairs. 15 | """ 16 | print_info(terminal.MOVE_UP * lines_to_move_up, end='') 17 | print_info('\n'.join([f'{item} ... {state}' for [item, state] in data])) 18 | 19 | 20 | def add(item): 21 | """ 22 | Adds a new item (with blank status) and prints the new state. 23 | """ 24 | with lock: 25 | index = len(data) 26 | data.append([item, '']) 27 | __print_state(index) 28 | return index 29 | 30 | 31 | def update(index, status): 32 | """ 33 | Updates the status of the item with the given index and prints the new state. 
34 | """ 35 | with lock: 36 | data[index][1] = status 37 | __print_state(len(data)) 38 | -------------------------------------------------------------------------------- /cli/cook/subcommands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/cli/cook/subcommands/__init__.py -------------------------------------------------------------------------------- /cli/cook/subcommands/wait.py: -------------------------------------------------------------------------------- 1 | from cook.querying import print_no_data, parse_entity_refs, query_with_stdin_support 2 | from cook.util import print_info, seconds_to_timedelta, guard_no_cluster 3 | 4 | 5 | def all_jobs_completed(jobs): 6 | """Returns jobs if they are all completed, otherwise False.""" 7 | if all(j.get('status') == 'completed' for j in jobs): 8 | return jobs 9 | else: 10 | return False 11 | 12 | 13 | def all_instances_completed(instances): 14 | """Returns instances if they are all completed, otherwise False.""" 15 | if all(i.get('status') == 'completed' for i in instances): 16 | return instances 17 | else: 18 | return False 19 | 20 | 21 | def all_groups_completed(groups): 22 | """Returns groups if they are all completed, otherwise False.""" 23 | if all(len(g.get('jobs')) == g.get('completed') for g in groups): 24 | return groups 25 | else: 26 | return False 27 | 28 | 29 | def wait(clusters, args, _): 30 | """Waits for jobs / instances / groups with the given UUIDs to complete.""" 31 | guard_no_cluster(clusters) 32 | timeout = args.get('timeout') 33 | interval = args.get('interval') 34 | entity_refs, _ = parse_entity_refs(clusters, args.get('uuid')) 35 | timeout_text = ('up to %s' % seconds_to_timedelta(timeout)) if timeout else 'indefinitely' 36 | print_info('Will wait %s.' 
% timeout_text) 37 | query_result, clusters_of_interest = query_with_stdin_support(clusters, entity_refs, all_jobs_completed, 38 | all_instances_completed, all_groups_completed, 39 | timeout, interval) 40 | if query_result['count'] > 0: 41 | return 0 42 | else: 43 | print_no_data(clusters_of_interest) 44 | return 1 45 | 46 | 47 | def register(add_parser, add_defaults): 48 | """Adds this sub-command's parser and returns the action function""" 49 | default_timeout = None 50 | default_timeout_text = 'wait indefinitely' 51 | default_interval = 15 52 | wait_parser = add_parser('wait', help='wait for jobs / instances / groups to complete by uuid') 53 | wait_parser.add_argument('uuid', nargs='*') 54 | wait_parser.add_argument('--timeout', '-t', 55 | help=f'maximum time (in seconds) to wait (default = {default_timeout_text})', type=int) 56 | wait_parser.add_argument('--interval', '-i', 57 | help=f'time (in seconds) to wait between polling (default = {default_interval})', type=int) 58 | 59 | add_defaults('wait', {'timeout': default_timeout, 'interval': default_interval}) 60 | 61 | return wait 62 | -------------------------------------------------------------------------------- /cli/cook/terminal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import textwrap 5 | 6 | 7 | MOVE_UP = '\033[F' 8 | 9 | 10 | class Color: 11 | PURPLE = '\033[95m' 12 | CYAN = '\033[96m' 13 | DARKCYAN = '\033[36m' 14 | BLUE = '\033[94m' 15 | GREEN = '\033[92m' 16 | YELLOW = '\033[93m' 17 | RED = '\033[91m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | END = '\033[0m' 21 | 22 | 23 | def failed(s): 24 | return colorize(s, Color.BOLD + Color.RED) 25 | 26 | 27 | def success(s): 28 | return colorize(s, Color.GREEN) 29 | 30 | 31 | def running(s): 32 | return colorize(s, Color.CYAN) 33 | 34 | 35 | def waiting(s): 36 | return colorize(s, Color.YELLOW) 37 | 38 | 39 | def reason(s): 40 | return colorize(s, Color.RED) 41 | 42 | 43 
| def bold(s): 44 | return colorize(s, Color.BOLD) 45 | 46 | 47 | wrap = textwrap.wrap 48 | 49 | 50 | def colorize(s, color): 51 | """Formats the given string with the given color""" 52 | return color + s + Color.END if tty() else s 53 | 54 | 55 | def __ls_color(s, code, fallback_fn): 56 | """ 57 | Parses the LS_COLORS environment variable to get consistent colors with the 58 | user's current setup, falling back to default formatting if the parsing fails 59 | """ 60 | if tty() and 'LS_COLORS' in os.environ: 61 | split_pairs = [p.split('=') for p in os.environ['LS_COLORS'].split(':')] 62 | matched_pairs = [p for p in split_pairs if len(p) == 2 and p[0] == code] 63 | if len(matched_pairs) > 0: 64 | return f'\033[{matched_pairs[0][1]}m{s}\033[0;0m' 65 | 66 | return fallback_fn(s) 67 | 68 | 69 | def tty(): 70 | """Returns true if running in a real terminal (as opposed to being piped or redirected)""" 71 | return sys.stdout.isatty() 72 | 73 | 74 | def directory(s): 75 | """Attempts to use the "di" entry in LS_COLORS, falling back to cyan""" 76 | return __ls_color(s, 'di', lambda t: colorize(t, Color.CYAN)) 77 | 78 | 79 | def executable(s): 80 | """Attempts to use the "ex" entry in LS_COLORS, falling back to green""" 81 | return __ls_color(s, 'ex', lambda t: colorize(t, Color.GREEN)) 82 | -------------------------------------------------------------------------------- /cli/cook/version.py: -------------------------------------------------------------------------------- 1 | VERSION = '3.9.5' 2 | -------------------------------------------------------------------------------- /cli/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | cli: Cook CLI tests 4 | -------------------------------------------------------------------------------- /cli/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup 4 | 5 | 
import logging
import unittest
import uuid

import pytest
import requests
import requests.adapters
import requests_mock
from cook import http

from cook.querying import query_cluster, make_job_request


@pytest.mark.cli
class CookCliTest(unittest.TestCase):
    """Unit tests for cluster querying in the Cook CLI."""

    # Allows the multiprocess test plugin to split this class across workers.
    _multiprocess_can_split_ = True

    def setUp(self):
        """Gives each test a module-scoped logger."""
        self.logger = logging.getLogger(__name__)

    def test_query_cluster_should_gracefully_handle_json_parsing_failures(self):
        """query_cluster should return an empty result, not raise, when the server responds with non-JSON."""
        plugins = {
            'http-adapter-factory': requests.adapters.HTTPAdapter,
            'http-session-factory': requests.Session,
        }
        http.configure(config={}, plugins=plugins)
        target_cluster = {'url': 'http://localhost'}
        job_uuids = [uuid.uuid4()]
        with requests_mock.mock() as mocked:
            mocked.get('http://localhost/rawscheduler', text='this is not json')
            result = query_cluster(target_cluster, job_uuids, None, None, None, make_job_request, 'job')
            self.assertEqual([], result)
pyinstaller==3.3 4 | 5 | RUN mkdir /opt/cook 6 | WORKDIR /opt/cook 7 | 8 | COPY requirements.txt /opt/cook/ 9 | RUN pip install -r requirements.txt 10 | 11 | COPY . /opt/cook 12 | 13 | # Create a one-folder bundle containing an executable (instead of using the one-file version). 14 | # Allows us to avoid the extraction to a temporary folder needed by the PyInstaller bootloader. 15 | CMD ["pyinstaller", "--onedir", "--name", "cook-executor", "--paths", "cook", "cook/__main__.py"] 16 | -------------------------------------------------------------------------------- /executor/RELEASING.md: -------------------------------------------------------------------------------- 1 | Releasing Cook Executor 2 | ======================= 3 | 4 | Cook Executor is released on [PyPI](https://pypi.org/project/cook-executor/) 5 | 6 | Prerequisites 7 | ------------- 8 | Ensure you can build the executor followng the instructions in README.md 9 | 10 | Install `twine`: 11 | ```bash 12 | pip3 install twine 13 | ``` 14 | 15 | Test Release 16 | ------------ 17 | Since PyPI does not allow modifying releases, it can be useful to test a release using their test instance. 18 | ```bash 19 | rm -rf dist/* 20 | python3 setup.py sdist bdist_wheel 21 | python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* 22 | ``` 23 | Then, in a separate virtualenv for testing: 24 | ```bash 25 | pip3 install --index-url https://test.pypi.org/simple/ --no-deps cook-executor==$VERSION 26 | pip3 install pymesos==0.3.9 # install any other required dependencies from the main pypi repo 27 | cook-executor 28 | ``` 29 | If there is an issue with the release, you can just release another version. They are GC-ed periodically from the test instance. 30 | 31 | Production Release 32 | ------------------ 33 | When you're ready to release the final version, just build and upload to the standard PyPI repo. 
34 | ```bash 35 | rm -rf dist/* 36 | python3 setup.py sdist bdist_wheel 37 | python3 -m twine upload dist/* 38 | ``` 39 | -------------------------------------------------------------------------------- /executor/bin/build-docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/build-docker.sh 4 | # Builds the version of cook executor that can execute inside a docker container. 5 | 6 | set -e 7 | 8 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | NAME=cook-executor-build 10 | 11 | EXECUTOR_DIR="$(dirname ${DIR})" 12 | 13 | mkdir -p ${EXECUTOR_DIR}/dist 14 | 15 | # build cook-executor inside docker image to avoid local python environment and architecture hassles 16 | cd ${EXECUTOR_DIR} 17 | docker build -t ${NAME} -f ${EXECUTOR_DIR}/Dockerfile.build . 18 | docker run --name ${NAME} ${NAME} 19 | rm -rf ${EXECUTOR_DIR}/dist/cook-executor 20 | docker cp ${NAME}:/opt/cook/dist/cook-executor ${EXECUTOR_DIR}/dist/cook-executor 21 | docker rm ${NAME} 22 | -------------------------------------------------------------------------------- /executor/bin/build-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/build-local.sh 4 | # Builds the version of cook executor that can execute locally. 5 | 6 | set -e 7 | 8 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | NAME=cook-executor-build 10 | 11 | EXECUTOR_DIR="$(dirname ${DIR})" 12 | 13 | mkdir -p ${EXECUTOR_DIR}/dist 14 | rm -rf ${EXECUTOR_DIR}/dist/cook-executor-local 15 | 16 | # Create a one-folder bundle containing an executable (instead of using the one-file version). 17 | # Allows us to avoid the extraction to a temporary folder needed by the PyInstaller bootloader. 
#!/usr/bin/env bash

# USAGE: ./bin/prepare-executor.sh MODE TARGET_DIR
# Builds the cook executor and then copies it to TARGET_DIR
# Examples:
#  ./bin/prepare-executor.sh docker /target/directory
#  ./bin/prepare-executor.sh local /target/directory
#
# Fix: quote all path expansions so the script works when EXECUTOR_DIR or
# TARGET_DIR contains spaces (previously unquoted and subject to word splitting).

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
EXECUTOR_DIR="$(dirname "${DIR}")"
MODE=${1}
TARGET_DIR=${2}

set -e

if [ -z "${MODE}" ]; then
    echo "ERROR: mode has not been specified!"
    exit 1
fi

if [[ ! "${MODE}" =~ ^(docker|local)$ ]]; then
    echo "ERROR: invalid mode (${MODE}) specified!"
    exit 1
fi

if [ -z "${TARGET_DIR}" ]; then
    echo "ERROR: target directory has not been specified!"
    exit 1
fi

# The docker build drops the "-local"/"-docker" suffix from the artifact name.
COOK_EXECUTOR_NAME="cook-executor-${MODE}"
if [[ "${MODE}" == docker ]]; then
    COOK_EXECUTOR_NAME="cook-executor"
fi

# Rebuild when the bundle is missing or its embedded version is stale.
COOK_EXECUTOR_PATH="${EXECUTOR_DIR}/dist/${COOK_EXECUTOR_NAME}"
if [ ! -d "${COOK_EXECUTOR_PATH}" ]; then
    echo "${COOK_EXECUTOR_NAME} not found at ${COOK_EXECUTOR_PATH}"
    DO_EXECUTOR_REBUILD=true
elif ! "${EXECUTOR_DIR}/bin/check-version.sh" -q "${COOK_EXECUTOR_NAME}"; then
    echo "${COOK_EXECUTOR_NAME} appears to be out of date"
    DO_EXECUTOR_REBUILD=true
else
    DO_EXECUTOR_REBUILD=false
fi

COOK_EXECUTOR_ZIP_NAME="${COOK_EXECUTOR_NAME}.tar.gz"
COOK_EXECUTOR_ZIP_FILE="${EXECUTOR_DIR}/dist/${COOK_EXECUTOR_ZIP_NAME}"
if ${DO_EXECUTOR_REBUILD}; then
    echo "Triggering build of ${COOK_EXECUTOR_NAME} before proceeding."
    "${EXECUTOR_DIR}/bin/build-${MODE}.sh"
    echo "Zipping contents of ${COOK_EXECUTOR_PATH}"
    pushd "${EXECUTOR_DIR}/dist"
    tar -cvzf "${COOK_EXECUTOR_ZIP_FILE}" "${COOK_EXECUTOR_NAME}"
    popd
else
    echo "Not triggering build of ${COOK_EXECUTOR_NAME}"
fi


# Only copy when the freshly built archive is newer than the existing target copy.
if [ "${COOK_EXECUTOR_ZIP_FILE}" -nt "${TARGET_DIR}/${COOK_EXECUTOR_ZIP_NAME}" ]; then
    echo "Copying ${COOK_EXECUTOR_ZIP_NAME} from ${COOK_EXECUTOR_ZIP_FILE} to ${TARGET_DIR}"
    mkdir -p "${TARGET_DIR}"
    cp -f "${COOK_EXECUTOR_ZIP_FILE}" "${TARGET_DIR}"
else
    echo "Not copying ${COOK_EXECUTOR_ZIP_NAME} to ${TARGET_DIR}"
fi
2 | # Be aware that changing the format may break the parsing logic. 3 | 4 | __version__ = "0.1.16" 5 | -------------------------------------------------------------------------------- /executor/cook/io_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This module ensures atomic writes to stdout.""" 4 | 5 | import logging 6 | import sys 7 | from threading import Lock 8 | 9 | import os 10 | 11 | __stdout_lock__ = Lock() 12 | 13 | 14 | def print_to_buffer(lock, buffer, data, flush=False, newline=True): 15 | """Helper function that prints data to the specified buffer in a thread-safe manner using the lock. 16 | 17 | Parameters 18 | ---------- 19 | lock: threading.Lock 20 | The lock to use 21 | buffer: byte buffer 22 | The buffer to write to 23 | data: string or bytes 24 | The data to output 25 | flush: boolean 26 | Flag determining whether to trigger a sys.stdout.flush() 27 | newline: boolean 28 | Flag determining whether to output a newline at the end 29 | 30 | Returns 31 | ------- 32 | Nothing. 33 | """ 34 | with lock: 35 | if isinstance(data, str): 36 | buffer.write(data.encode()) 37 | else: 38 | buffer.write(data) 39 | if newline: 40 | buffer.write(os.linesep.encode()) 41 | if flush: 42 | buffer.flush() 43 | 44 | 45 | def print_out(data, flush=False, newline=True): 46 | """Wrapper function that prints to stdout in a thread-safe manner using the __stdout_lock__ lock. 47 | 48 | Parameters 49 | ---------- 50 | data: string or bytes 51 | The data to output 52 | flush: boolean 53 | Flag determining whether to trigger a sys.stdout.flush() 54 | newline: boolean 55 | Flag determining whether to output a newline at the end 56 | 57 | Returns 58 | ------- 59 | Nothing. 
60 | """ 61 | print_to_buffer(__stdout_lock__, sys.stdout.buffer, data, flush=flush, newline=newline) 62 | 63 | 64 | def print_and_log(string_data, newline=True): 65 | """Wrapper function that prints and flushes to stdout in a locally thread-safe manner ensuring newline at the start. 66 | The function also outputs the same message via logging.info(). 67 | 68 | Parameters 69 | ---------- 70 | string_data: string 71 | The string to output 72 | newline: boolean 73 | Flag determining whether to output a newline at the end 74 | 75 | Returns 76 | ------- 77 | Nothing. 78 | """ 79 | print_out('{}{}'.format(os.linesep, string_data), flush=True, newline=newline) 80 | logging.info(string_data) 81 | -------------------------------------------------------------------------------- /executor/cook/util.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import logging 3 | import resource 4 | import sys 5 | import threading 6 | import traceback 7 | 8 | __rusage_denom_mb = 1024.0 9 | if sys.platform == 'darwin': 10 | # in OSX the output is in different units 11 | __rusage_denom_mb = __rusage_denom_mb * 1024 12 | 13 | 14 | def print_memory_usage(): 15 | """Logs the memory usage of the executor.""" 16 | try: 17 | max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 18 | logging.info('Executor Memory usage: {} MB'.format(max_rss / __rusage_denom_mb)) 19 | except Exception: 20 | logging.exception('Error in logging memory usage') 21 | 22 | 23 | def is_out_of_memory_error(exception): 24 | """Returns true iff exception is an instance of OSError and error code represents an out of memory error.""" 25 | return isinstance(exception, OSError) and exception.errno == errno.ENOMEM 26 | 27 | 28 | def log_thread_stack_traces(): 29 | """Logs the stack traces for all threads.""" 30 | try: 31 | logging.info('Logging stack traces for all threads') 32 | for th in threading.enumerate(): 33 | logging.info(th) 34 | 
logging.info(''.join(traceback.format_stack(sys._current_frames()[th.ident]))) 35 | except: 36 | logging.exception('Error in logging thread stack traces') 37 | -------------------------------------------------------------------------------- /executor/requirements.txt: -------------------------------------------------------------------------------- 1 | psutil==5.4.1 2 | pyinstaller==3.3 3 | pymesos==0.3.9 4 | -------------------------------------------------------------------------------- /executor/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = -n 1 -v --timeout-method=thread 3 | timeout = 1200 4 | -------------------------------------------------------------------------------- /executor/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup 5 | 6 | test_deps=[ 7 | 'pytest==5.2.0', 8 | 'pytest-timeout==1.3.3', 9 | 'pytest-xdist==1.30.0' 10 | ] 11 | 12 | extras = { 'test': test_deps } 13 | 14 | setup( 15 | name='cook-executor', 16 | version=open("cook/_version.py").readlines()[-1].split('"')[1], 17 | description='Custom Mesos executor for Cook written in Python', 18 | url='https://github.com/twosigma/Cook', 19 | license="Apache Software License 2.0", 20 | keywords='cook-executor', 21 | packages=['cook'], 22 | test_suite='tests', 23 | tests_require=test_deps, 24 | extras_require=extras, 25 | install_requires=['psutil==5.4.1', 'pymesos==0.3.9'], 26 | entry_points={ 27 | 'console_scripts': [ 28 | 'cook-executor = cook.__main__:main' 29 | ] 30 | }, 31 | classifiers=[ 32 | "Programming Language :: Python :: 3.5" 33 | ] 34 | ) 35 | -------------------------------------------------------------------------------- /executor/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/executor/tests/__init__.py -------------------------------------------------------------------------------- /executor/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # This file is automatically loaded and run by pytest during its setup process, 2 | # meaning it happens before any of the tests in this directory are run. 3 | # See the pytest documentation on conftest files for more information: 4 | # https://docs.pytest.org/en/2.7.3/plugins.html#conftest-py-plugins 5 | 6 | # Please see: https://github.com/twosigma/Cook/issues/749 7 | 8 | import pymesos as pm 9 | 10 | pm.encode_data((str({'foo': 'bar'}).encode('utf8'))) 11 | -------------------------------------------------------------------------------- /executor/travis/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the Cook Executor tests 4 | 5 | export PROJECT_DIR=`pwd` 6 | cd ${PROJECT_DIR} 7 | 8 | python --version 9 | pytest --version 10 | 11 | pytest -n4 12 | -------------------------------------------------------------------------------- /executor/travis/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Sets up the travis worker to be able to run executor tests. 
4 | 5 | export PROJECT_DIR=`pwd` 6 | cd ${PROJECT_DIR} 7 | 8 | python --version 9 | pip install -e '.[test]' 10 | -------------------------------------------------------------------------------- /integration/.dockerignore: -------------------------------------------------------------------------------- 1 | .eggs 2 | .idea 3 | .cache 4 | .minimesos 5 | .pytest_cache 6 | bin 7 | build 8 | cook_integration.egg-info 9 | dist 10 | integration.iml 11 | travis 12 | **/__pycache__ 13 | virtualenv* 14 | venv* 15 | -------------------------------------------------------------------------------- /integration/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | *.egg-info 4 | *.egg* 5 | .cache 6 | venv/ 7 | virtualenv-integrationtest* 8 | .pytest_cache 9 | /.cs.json 10 | -------------------------------------------------------------------------------- /integration/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /opt/cook/integration 4 | COPY requirements.txt /opt/cook/integration 5 | ADD cli.tar.gz /opt/cook/cli/ 6 | RUN pip install -r requirements.txt 7 | 8 | # Don't need to copy over the integration test files --- they're bind-mounted. 9 | ENTRYPOINT ["pytest"] -------------------------------------------------------------------------------- /integration/bin/build-docker-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: build-docker-image.sh 4 | # Builds a docker image containing the cook scheduler integration tests. 5 | 6 | INTEGRATION_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )" 7 | NAME=cook-integration 8 | 9 | echo "Building docker images for ${NAME} IN $(dirname ${INTEGRATION_DIR})/cli" 10 | cd $(dirname ${INTEGRATION_DIR})/cli 11 | tar -c . 
| gzip -n >${INTEGRATION_DIR}/cli.tar.gz 12 | cd ${INTEGRATION_DIR} 13 | docker build -t ${NAME} ${INTEGRATION_DIR} 14 | rm cli.tar.gz 15 | -------------------------------------------------------------------------------- /integration/bin/only-run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, pytest, sys 4 | 5 | if len(sys.argv) < 2: 6 | sys.exit("USAGE: {} TEST_NAMES...".format(sys.argv[0])) 7 | 8 | target_names = sys.argv[1:] 9 | 10 | class FindTestsPlugin(object): 11 | 12 | def __init__(self): 13 | self.matched = [] 14 | 15 | def pytest_collection_modifyitems(self, items): 16 | for target_name in target_names: 17 | suffix = ':' + target_name 18 | for item in items: 19 | if item.nodeid.endswith(suffix): 20 | self.matched.append(item.nodeid) 21 | 22 | find_tests = FindTestsPlugin() 23 | pytest.main(['-c/dev/null', '--collect-only', '-p', 'no:terminal'], plugins=[find_tests]) 24 | 25 | if not find_tests.matched: 26 | sys.exit("No tests found with the given names.") 27 | 28 | os.execvp('pytest', ['-c/dev/null', '-n0', '-sv'] + find_tests.matched) 29 | -------------------------------------------------------------------------------- /integration/requirements.txt: -------------------------------------------------------------------------------- 1 | beakerx==1.3.0 2 | tornado==6.1.0 3 | jupyter_client==7.1.0 4 | nbconvert==6.3.0 5 | nbformat==5.1.3 6 | numpy==1.21.0 7 | pip==9.0.1; python_version >= '3.6' 8 | pytest==5.2.0 9 | pytest-timeout==1.3.3 10 | pytest-xdist==1.30.0 11 | python-dateutil==2.8.1 12 | requests==2.20.0 13 | retrying==1.3.3 14 | file:../cli#egg=cook_client 15 | pygit2==1.7.2 16 | -------------------------------------------------------------------------------- /integration/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = -n10 -v --timeout-method=thread --maxfail=3 --log-level=DEBUG --durations=25 3 | 
timeout = 1200 4 | usefixtures = record_test_metric 5 | markers = 6 | cli: marks tests as testing the cs CLI 7 | memlimit: marks tests as checking that exceeding the memory limit works as expected 8 | multi_user: marks tests as using multiple users (e.g. one admin and one non-admin) 9 | scheduler_not_in_docker: marks tests that should be skipped when Cook itself runs in Docker 10 | serial: marks tests as needing to run in series rather than in parallel with other tests 11 | travis_skip: marks tests that should be skipped in Travis CI 12 | -------------------------------------------------------------------------------- /integration/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | logging.basicConfig(format='%(asctime)s [%(levelname)s] [%(process)d] %(message)s', level=logging.DEBUG) 5 | -------------------------------------------------------------------------------- /integration/tests/cook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/integration/tests/cook/__init__.py -------------------------------------------------------------------------------- /integration/tests/cook/conftest.py: -------------------------------------------------------------------------------- 1 | # This file is automatically loaded and run by pytest during its setup process, 2 | # meaning it happens before any of the tests in this directory are run. 3 | # See the pytest documentation on conftest files for more information: 4 | # https://docs.pytest.org/en/2.7.3/plugins.html#conftest-py-plugins 5 | import logging 6 | import os 7 | import socket 8 | import subprocess 9 | import threading 10 | import time 11 | 12 | from tests.cook import util 13 | 14 | 15 | def _sudo_check(user): 16 | """ 17 | Check if the current user can sudo as a test user. 
# Named constants for failure reason codes from cook or mesos.
# See scheduler/src/cook/mesos/schema.clj for the reason code names.
# (Sorted by numeric code.)
REASON_KILLED_BY_USER = 1001
REASON_TASK_KILLED_DURING_LAUNCH = 1004
CONTAINER_INITIALIZATION_TIMED_OUT = 1007
MAX_RUNTIME_EXCEEDED = 2003
EXECUTOR_UNREGISTERED = 6002
UNKNOWN_MESOS_REASON = 99001
CMD_NON_ZERO_EXIT = 99003

# Named constants for unscheduled job reason strings from cook or fenzo.
UNDER_INVESTIGATION = 'The job is now under investigation. Check back in a minute for more details!'
COULD_NOT_PLACE_JOB = "The job couldn't be placed on any available hosts."
JOB_WOULD_EXCEED_QUOTA = 'The job would cause you to exceed resource quotas.'
JOB_IS_RUNNING_NOW = 'The job is running now.'
JOB_LAUNCH_RATE_LIMIT = 'You are currently rate limited on how many jobs you launch per minute.'
PLUGIN_IS_BLOCKING = 'The launch filter plugin is blocking the job launch.'
| 28 | def test_get_queue(self): 29 | bad_constraint = [["HOSTNAME", 30 | "EQUALS", 31 | "lol won't get scheduled"]] 32 | uuid, resp = util.submit_job(self.master_url, command='sleep 30', constraints=bad_constraint) 33 | self.assertEqual(201, resp.status_code, resp.content) 34 | try: 35 | slave_queue = util.session.get('%s/queue' % self.slave_url, allow_redirects=False) 36 | self.assertEqual(307, slave_queue.status_code) 37 | default_pool = util.default_pool(self.master_url) 38 | pool = default_pool or 'no-pool' 39 | self.logger.info(f'Checking the queue endpoint for pool {pool}') 40 | 41 | @retry(stop_max_delay=30000, wait_fixed=1000) # Need to wait for a rank cycle 42 | def check_queue(): 43 | master_queue = util.session.get(slave_queue.headers['Location']) 44 | self.assertEqual(200, master_queue.status_code, master_queue.content) 45 | pool_queue = master_queue.json()[pool] 46 | self.assertTrue(any([job['job/uuid'] == uuid for job in pool_queue]), pool_queue) 47 | 48 | check_queue() 49 | finally: 50 | util.kill_jobs(self.master_url, [uuid]) 51 | -------------------------------------------------------------------------------- /integration/tests/cook/test_multi_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import unittest 4 | 5 | import logging 6 | 7 | from tests.cook import util 8 | 9 | 10 | @unittest.skipUnless(os.getenv('COOK_MULTI_CLUSTER') is not None, 11 | 'Requires setting the COOK_MULTI_CLUSTER environment variable') 12 | @pytest.mark.timeout(util.DEFAULT_TEST_TIMEOUT_SECS) # individual test timeout 13 | class MultiClusterTest(unittest.TestCase): 14 | 15 | @classmethod 16 | def setUpClass(cls): 17 | cls.cook_url_1 = util.retrieve_cook_url() 18 | cls.cook_url_2 = util.retrieve_cook_url('COOK_SCHEDULER_URL_2', 'http://localhost:22321') 19 | util.init_cook_session(cls.cook_url_1, cls.cook_url_2) 20 | 21 | def setUp(self): 22 | self.cook_url_1 = type(self).cook_url_1 23 | 
self.cook_url_2 = type(self).cook_url_2 24 | self.logger = logging.getLogger(__name__) 25 | 26 | def test_federated_query(self): 27 | # Submit to cluster #1 28 | job_uuid_1, resp = util.submit_job(self.cook_url_1) 29 | self.assertEqual(resp.status_code, 201) 30 | 31 | # Submit to cluster #2 32 | job_uuid_2, resp = util.submit_job(self.cook_url_2) 33 | self.assertEqual(resp.status_code, 201) 34 | 35 | # Ask for both jobs from cluster #1, expect to get the first 36 | resp = util.query_jobs(self.cook_url_1, uuid=[job_uuid_1, job_uuid_2], partial=True) 37 | self.assertEqual(200, resp.status_code, resp.json()) 38 | self.assertEqual(1, len(resp.json())) 39 | self.assertEqual([job_uuid_1], [job['uuid'] for job in resp.json()]) 40 | 41 | # Ask for both jobs from cluster #2, expect to get the second 42 | resp = util.query_jobs(self.cook_url_2, uuid=[job_uuid_1, job_uuid_2], partial=True) 43 | self.assertEqual(200, resp.status_code, resp.json()) 44 | self.assertEqual(1, len(resp.json())) 45 | self.assertEqual([job_uuid_2], [job['uuid'] for job in resp.json()]) 46 | -------------------------------------------------------------------------------- /integration/travis/prepare_integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | PROJECT_DIR=`pwd` ../travis/prepare.sh 6 | python --version 7 | 8 | # Explicitly uninstall cli 9 | if [[ $(pip list --format=columns | grep cook-client) ]]; 10 | then 11 | pip uninstall -y cook-client 12 | fi 13 | 14 | pip install -r requirements.txt 15 | -------------------------------------------------------------------------------- /jobclient/README.md: -------------------------------------------------------------------------------- 1 | # Cook Java Client 2 | 3 | Please run `mvn javadoc:javadoc` to build the docs for this project. 4 | The main entrypoint is `com.twosigma.cook.jobclient`; read the Javadocs for details. 
5 | 6 | # Example Usage 7 | 8 | Submitting two jobs that should run in the same AWS region: 9 | 10 | ```java 11 | public class HostPlacementExample { 12 | @Test 13 | public void twoJobsInTheSameRegion() throws URISyntaxException, JobClientException { 14 | // Create a host placement constraint where the region attribute must equal across hosts 15 | HostPlacement.Builder hostPlacementBuilder = new HostPlacement.Builder(); 16 | hostPlacementBuilder.setType(HostPlacement.Type.ATTRIBUTE_EQUALS); 17 | hostPlacementBuilder.setParameter("attribute", "region"); 18 | HostPlacement hostPlacement = hostPlacementBuilder.build(); 19 | 20 | // Create a job group with the host placement constraint 21 | Group.Builder groupBuilder = new Group.Builder(); 22 | groupBuilder.setUUID(UUID.randomUUID()); 23 | groupBuilder.setName("testing"); 24 | groupBuilder.setHostPlacement(hostPlacement); 25 | Group group = groupBuilder.build(); 26 | 27 | // Create two jobs and place them in the job group 28 | Job.Builder jobBuilder = new Job.Builder(); 29 | jobBuilder.setCommand("echo hello"); 30 | jobBuilder.setCpus(1.0); 31 | jobBuilder.setMemory(128.0); 32 | jobBuilder.setGroup(group); 33 | jobBuilder.setUUID(UUID.randomUUID()); 34 | Job job1 = jobBuilder.build(); 35 | jobBuilder.setUUID(UUID.randomUUID()); 36 | Job job2 = jobBuilder.build(); 37 | 38 | // Create a job client and submit our jobs and job group 39 | JobClient.Builder clientBuilder = new JobClient.Builder(); 40 | clientBuilder.setHost("localhost"); 41 | clientBuilder.setPort(12321); 42 | clientBuilder.setJobEndpoint("rawscheduler"); 43 | JobClient client = clientBuilder.build(); 44 | client.submitWithGroups(Arrays.asList(job1, job2), Collections.singletonList(group)); 45 | } 46 | } 47 | ``` 48 | 49 | # Running the Tests 50 | 51 | The easiest way to run the JobClient unit tests is to use Maven: 52 | 53 | ```bash 54 | mvn dependency:resolve 55 | mvn test 56 | ``` 57 | 58 | © Two Sigma Open Source, LLC 59 | 
-------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/Executor.java: -------------------------------------------------------------------------------- 1 | package com.twosigma.cook.jobclient; 2 | 3 | /** 4 | * Enum representing valid options for the executor field in a job and instance. 5 | */ 6 | 7 | public enum Executor { 8 | COOK, 9 | EXECUTOR; 10 | 11 | public static Executor fromString(final String name) { 12 | for (final Executor executor : values()) { 13 | if (executor.name().toLowerCase().equals(name)) { 14 | return executor; 15 | } 16 | } 17 | return null; 18 | } 19 | 20 | public String displayName() { 21 | return name().toLowerCase(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/GroupListener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.twosigma.cook.jobclient; 18 | 19 | 20 | /** 21 | * Interface for group listener. 22 | *

23 | * Created: November 28, 2016 24 | * 25 | * @author diego 26 | */ 27 | public interface GroupListener { 28 | /** 29 | * The following method will be invoked in any of the following transitions: 30 | * INITIALIZED -> 1+ JOBS STILL RUNNING -> COMPLETED where it will receive a 31 | * {@link Group} object. 32 | *

33 | * Note that if any exception when {@link JobClient} invokes this method for a group status 34 | * update, it will just simply log this exception. It won't invoke this method for the 35 | * particular status update again. 36 | * 37 | * @param group 38 | */ 39 | public void onStatusUpdate(Group group); 40 | } 41 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/InstanceDecorator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.twosigma.cook.jobclient; 18 | 19 | /** 20 | * The interface of instance decorator which will take an instance builder as input and return a decorated instance 21 | * builder. 22 | * 23 | * Created: June 23, 2016 24 | * @author wzhao 25 | */ 26 | public interface InstanceDecorator { 27 | 28 | /** 29 | * @param builder The {@link Instance.Builder} expected to decorate. 30 | * @return a decorated instance builder. 
31 | */ 32 | Instance.Builder decorate(Instance.Builder builder); 33 | } 34 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/JobClientException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.twosigma.cook.jobclient; 19 | 20 | /** 21 | * Job client exception. 22 | *

23 | * Created: March 14, 2015 24 | * 25 | * @author wzhao 26 | */ 27 | public class JobClientException extends Exception { 28 | private static final long serialVersionUID = 1L; 29 | 30 | private final Integer httpResponseCode; 31 | 32 | JobClientException(final String msg) { 33 | this(msg, (Integer) null); 34 | } 35 | 36 | JobClientException(final String msg, final Throwable cause) { 37 | this(msg, cause, null); 38 | } 39 | 40 | JobClientException(final String msg, final Integer httpResponseCode) { 41 | super(msg); 42 | this.httpResponseCode = httpResponseCode; 43 | } 44 | 45 | 46 | JobClientException(final String msg, final Throwable cause, final Integer httpResponseCode) { 47 | super(msg, cause); 48 | this.httpResponseCode = httpResponseCode; 49 | } 50 | 51 | public Integer getHttpResponseCode() { 52 | return httpResponseCode; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/JobListener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.twosigma.cook.jobclient; 18 | 19 | 20 | /** 21 | * Interface for job listener. 22 | *

23 | * Created: March 14, 2015 24 | * 25 | * @author wzhao 26 | */ 27 | public interface JobListener { 28 | /** 29 | * The following method will be invoked in any of the following job status transitions: 30 | * INITIALIZED -> WAITING, WAITING -> RUNNING, RUNNING -> COMPLETED where it will receive a 31 | * {@link Job} object with a possible status WAITING, RUNNING and COMPLETED respectively. 32 | *

33 | * Note that if any exception when {@link JobClient} invokes this method for a job status 34 | * update, it will just simply log this exception. It won't invoke this method for the 35 | * particular status update again. 36 | * 37 | * @param job 38 | */ 39 | public void onStatusUpdate(Job job); 40 | } 41 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/auth/spnego/GSSCredentialProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.twosigma.cook.jobclient.auth.spnego; 18 | 19 | import org.ietf.jgss.GSSCredential; 20 | 21 | /** 22 | * A simple {@link GSSCredential} provider could be used to hold or provide the latest valid 23 | * credential. 24 | *

25 | * Created: January 14, 2016 26 | * 27 | * @author wzhao 28 | */ 29 | public class GSSCredentialProvider { 30 | private GSSCredential _credential = null; 31 | 32 | /** 33 | * @return the {@link GSSCredential} held in this provider. If there is no credential held in 34 | * this hold, it will simply return null. 35 | */ 36 | public synchronized GSSCredential getCredential() { 37 | return _credential; 38 | } 39 | 40 | /** 41 | * @return update {@link GSSCredential} held in this provider. 42 | */ 43 | public synchronized void setCredential(GSSCredential credential) { 44 | _credential = credential; 45 | } 46 | 47 | /** 48 | * Clean the {@link GSSCredential} held in this provider. 49 | */ 50 | public synchronized void clear() { 51 | _credential = null; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/constraint/Constraint.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.twosigma.cook.jobclient.constraint; 18 | 19 | import org.json.JSONArray; 20 | import org.json.JSONException; 21 | 22 | /** 23 | * The interface to specify a constraint in Cook 24 | *

25 | * A constraint in Cook could be one of the following three forms 26 | *

31 | * Examples of constraints are 32 | * 37 | */ 38 | public interface Constraint { 39 | /** 40 | * @return this constraint as a JSONArray. 41 | */ 42 | JSONArray toJson() throws JSONException; 43 | 44 | /** 45 | * @return the attribute of this constraint. 46 | */ 47 | String getAttribute(); 48 | 49 | /** 50 | * @return the operator of this constraint. 51 | */ 52 | Operator getOperator(); 53 | } 54 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/constraint/OneToOneConstraint.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.twosigma.cook.jobclient.constraint; 17 | 18 | import org.json.JSONArray; 19 | import org.json.JSONException; 20 | 21 | import java.util.Objects; 22 | 23 | /** 24 | * A constraint of form 25 | * 28 | */ 29 | final class OneToOneConstraint implements Constraint { 30 | private Operator _operator; 31 | private String _attribute; 32 | private String _value; 33 | 34 | OneToOneConstraint(Operator operator, String attribute, String value) { 35 | _operator = operator; 36 | _attribute = attribute.trim(); 37 | _value = value.trim(); 38 | } 39 | 40 | @Override 41 | public boolean equals(Object o) { 42 | if (o == null) return false; 43 | if (o == this) return true; 44 | 45 | if (!(o instanceof OneToOneConstraint)) return false; 46 | 47 | OneToOneConstraint other = (OneToOneConstraint) o; 48 | 49 | if (!Objects.equals(this._operator, other._operator)) return false; 50 | if (!Objects.equals(this._attribute, other._attribute)) return false; 51 | if (!Objects.equals(this._value, other._value)) return false; 52 | 53 | return true; 54 | } 55 | 56 | @Override 57 | public int hashCode() { 58 | return Objects.hash(_operator, _attribute, _value); 59 | } 60 | 61 | @Override 62 | public JSONArray toJson() 63 | throws JSONException { 64 | JSONArray jsonArray = new JSONArray(); 65 | jsonArray.put(0, _attribute); 66 | jsonArray.put(1, _operator.toString()); 67 | jsonArray.put(2, _value); 68 | return jsonArray; 69 | } 70 | 71 | @Override 72 | public String getAttribute() { 73 | return _attribute; 74 | } 75 | 76 | @Override 77 | public Operator getOperator() { 78 | return _operator; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /jobclient/java/src/main/java/com/twosigma/cook/jobclient/constraint/Operator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 
5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.twosigma.cook.jobclient.constraint; 17 | 18 | public enum Operator { 19 | EQUALS("EQUALS"); 20 | 21 | Operator(String name) { 22 | } 23 | 24 | /** 25 | * Parse an operator from its string representation. 26 | * 27 | * @param op specifies a string representation of operator. 28 | * @return an operator for the specified name. 29 | */ 30 | public static Operator fromString(String op) { 31 | return Enum.valueOf(Operator.class, op.trim().toUpperCase()); 32 | } 33 | } -------------------------------------------------------------------------------- /jobclient/java/src/test/java/com/twosigma/ConstraintTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Two Sigma Open Source, LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.twosigma; 17 | 18 | import com.twosigma.cook.jobclient.constraint.Constraints; 19 | import com.twosigma.cook.jobclient.constraint.Constraint; 20 | import com.twosigma.cook.jobclient.constraint.Operator; 21 | import org.json.JSONArray; 22 | import org.junit.Assert; 23 | import org.junit.Test; 24 | 25 | public class ConstraintTest { 26 | 27 | @Test 28 | public void testScope() { 29 | Constraint c = Constraints.buildEqualsConstraint("bar", "foo"); 30 | Assert.assertEquals(c.getAttribute(), "bar"); 31 | Assert.assertEquals(c.getOperator(), Operator.EQUALS); 32 | Assert.assertEquals(c.toJson().getString(2), "foo"); 33 | } 34 | 35 | @Test 36 | public void testParseFrom() { 37 | String constraintString = "[bar,EQUALS,foo]"; 38 | Constraint parsedConstraint = Constraints.parseFrom(new JSONArray(constraintString)); 39 | Constraint expectedConstraint = Constraints.buildEqualsConstraint("bar", "foo"); 40 | Assert.assertEquals(parsedConstraint, expectedConstraint); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /jobclient/java/src/test/java/com/twosigma/cook/jobclient/FetchableURITest.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Copyright (c) Two Sigma Open Source, LLC 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.twosigma.cook.jobclient; 19 | 20 | import org.json.JSONException; 21 | import org.json.JSONObject; 22 | import org.junit.Assert; 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | 26 | /** 27 | * Unit tests for {@link FetchableURI}. 28 | * 29 | * @author dgrnbrg 30 | */ 31 | public class FetchableURITest { 32 | 33 | private FetchableURI _uri; 34 | 35 | @Before 36 | public void setup() { 37 | final FetchableURI.Builder builder = new FetchableURI.Builder(); 38 | builder.setValue("http://example.com/myresource.sh"); 39 | builder.setExecutable(true); 40 | builder.setExtract(false); 41 | builder.setCache(true); 42 | _uri = builder.build(); 43 | } 44 | 45 | @Test 46 | public void testJsonizeURI() throws JSONException { 47 | final JSONObject json = FetchableURI.jsonizeUri(_uri); 48 | Assert.assertEquals(json.getString("value"), _uri.getValue()); 49 | Assert.assertEquals(json.getBoolean("executable"), _uri.isExecutable()); 50 | } 51 | 52 | @Test 53 | public void testParseFromJSON() throws JSONException { 54 | final JSONObject json = FetchableURI.jsonizeUri(_uri); 55 | Assert.assertEquals(FetchableURI.parseFromJSON(json), _uri); 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /jobclient/python/README.md: -------------------------------------------------------------------------------- 1 | # The Cook Scheduler Python Client API 2 | 3 | This package defines a client API for Cook Scheduler, allowing Python applications to easily integrate with Cook. 4 | 5 | ## Quickstart 6 | 7 | The code below shows how to use the client API to connect to a Cook cluster listening on `localhost:12321`, submit a job to the cluster, and query its information. 
8 | 9 | ```python 10 | from cookclient import JobClient 11 | 12 | client = JobClient('localhost:12321') 13 | 14 | uuid = client.submit(command='ls') 15 | job = client.query(uuid) 16 | print(str(job)) 17 | ``` 18 | -------------------------------------------------------------------------------- /jobclient/python/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /jobclient/python/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /jobclient/python/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | 17 | from cookclient import CLIENT_VERSION 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'Cook Python Client API' 23 | copyright = '2020, Two Sigma' 24 | author = 'Two Sigma' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = CLIENT_VERSION 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx_rtd_theme' 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 
41 | templates_path = ['_templates'] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns = [] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = 'sphinx_rtd_theme' 55 | 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 59 | html_static_path = ['_static'] 60 | 61 | # Set the master_doc value, as readthedocs uses an older version of Sphinx 62 | # which will default to `contents` instead of `index`. 63 | master_doc = 'index' 64 | -------------------------------------------------------------------------------- /jobclient/python/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Cook Python Client API documentation master file, created by 2 | sphinx-quickstart on Mon Jun 8 10:47:38 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Cook Python Client API 7 | ====================== 8 | 9 | This is the official Python client library for interacting with Cook Scheduler. 10 | 11 | Quickstart 12 | ---------- 13 | 14 | The code below shows how to use the client API to connect to a Cook cluster 15 | listening on http://localhost:12321, submit a job to the cluster, and query its 16 | information. 17 | 18 | .. 
highlight:: python 19 | 20 | :: 21 | 22 | from cookclient import JobClient 23 | 24 | client = JobClient('localhost:12321') 25 | 26 | uuid = client.submit(command='ls') 27 | job = client.query(uuid) 28 | print(str(job)) 29 | 30 | 31 | 32 | .. toctree:: 33 | :maxdepth: 2 34 | :caption: Contents: 35 | 36 | usage 37 | api 38 | 39 | 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /jobclient/python/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | sphinx ~= 3.0.4 3 | sphinx-rtd-theme ~= 0.4.3 4 | -------------------------------------------------------------------------------- /jobclient/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup 4 | 5 | from cookclient import CLIENT_VERSION 6 | 7 | with open('README.md') as fd: 8 | readme = fd.read() 9 | 10 | requirements = [ 11 | 'requests' 12 | ] 13 | 14 | setup(name='cook-client-api', 15 | version=CLIENT_VERSION, 16 | description="Cook Scheduler Client API for Python", 17 | long_description=readme, 18 | long_description_content_type='text/markdown', 19 | packages=['cookclient'], 20 | url='https://github.com/twosigma/Cook', 21 | install_requires=requirements, 22 | classifiers=[ 23 | "Development Status :: 3 - Alpha", 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Operating System :: OS Independent" 27 | ], 28 | python_requires='>=3.6') 29 | -------------------------------------------------------------------------------- /scheduler/.dockerignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .lein-failures 3 | .minimesos 4 | .nrepl-port 5 | bin 6 | classes 7 | datomic/datomic*/data 8 | 
datomic/datomic*/log 9 | datomic/datomic*/lib/cook*.jar 10 | docs 11 | gclog.* 12 | log 13 | simulator_files 14 | target 15 | test 16 | test-log 17 | test-resources 18 | virtualenv* 19 | venv* 20 | -------------------------------------------------------------------------------- /scheduler/.gitignore: -------------------------------------------------------------------------------- 1 | .pytest_cache 2 | gclog* 3 | .calva/ 4 | -------------------------------------------------------------------------------- /scheduler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mesosphere/mesos:1.3.0 2 | 3 | 4 | # Removing docker.list because docker APT repo has been deleted: 5 | # https://www.docker.com/blog/changes-dockerproject-org-apt-yum-repositories/ 6 | RUN rm /etc/apt/sources.list.d/docker.list && \ 7 | apt-get -y update && apt-get -y install software-properties-common && \ 8 | sudo apt-get install --reinstall ca-certificates && \ 9 | add-apt-repository ppa:openjdk-r/ppa && apt-get -y update && \ 10 | apt-get --no-install-recommends -y install \ 11 | curl \ 12 | openjdk-11-jdk \ 13 | unzip && apt-get clean && rm -Rf /var/lib/apt/lists/* 14 | 15 | # Env setup 16 | ENV HOME "/root/" 17 | ENV LEIN_ROOT true 18 | ENV MESOS_NATIVE_JAVA_LIBRARY /usr/lib/libmesos.so 19 | ENV JAVA_CMD=/usr/lib/jvm/java-11-openjdk-amd64/bin/java 20 | 21 | # Generate SSL certificate 22 | RUN mkdir /opt/ssl 23 | RUN keytool -genkeypair -keystore /opt/ssl/cook.p12 -storetype PKCS12 -storepass cookstore -dname "CN=cook, OU=Cook Developers, O=Two Sigma Investments, L=New York, ST=New York, C=US" -keyalg RSA -keysize 2048 24 | 25 | # Lein setup 26 | RUN mkdir $HOME/bin 27 | ENV PATH $PATH:$HOME/bin 28 | RUN curl -o $HOME/bin/lein https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein && chmod a+x $HOME/bin/lein && lein 29 | 30 | # Create and set the cook dir, copying project file 31 | COPY project.clj /opt/cook/ 32 | WORKDIR /opt/cook 
33 | 34 | # Fetch dependencies 35 | ## Only copy the project.clj so that we can use the cached layer 36 | ## with fetched dependencies as long as project.clj isn't modified 37 | RUN lein deps 38 | 39 | # Datomic setup 40 | COPY datomic /opt/cook/datomic 41 | RUN unzip -uo /opt/cook/datomic/datomic-free-0.9.5561.56.zip 42 | 43 | # Copy the whole scheduler into the container 44 | COPY docker /opt/cook/docker 45 | COPY resources /opt/cook/resources 46 | COPY java /opt/cook/java 47 | COPY src /opt/cook/src 48 | 49 | RUN lein uberjar 50 | RUN cp "target/cook-$(lein print :version | tr -d '"').jar" datomic-free-0.9.5561.56/lib/cook-$(lein print :version | tr -d '"').jar 51 | COPY config* /opt/cook/ 52 | 53 | # Ugly hack. Our .cook_kubeconfig lookup assumes it can be found in ../scheduler/ so make a symlink 54 | RUN ln -s /opt/cook /opt/scheduler 55 | COPY .cook_kubeconfig_* /opt/cook/ 56 | 57 | # Run cook 58 | EXPOSE \ 59 | 4334 \ 60 | 4335 \ 61 | 4336 \ 62 | 12321 \ 63 | 12322 64 | ENTRYPOINT ["/opt/cook/docker/run-cook.sh"] 65 | CMD ["config.edn"] 66 | -------------------------------------------------------------------------------- /scheduler/api-only-config.edn: -------------------------------------------------------------------------------- 1 | {:api-only? true 2 | :authorization {:one-user #config/env "USER"} 3 | :authorization-config {;; These users have admin privileges when using configfile-admins-auth; 4 | ;; e.g., they can view and modify other users' jobs. 5 | :admins #{"admin" "root"} 6 | ;; What function should be used to perform user authorization? 7 | ;; See the docstring in cook.rest.authorization for details. 
8 | :authorization-fn cook.rest.authorization/configfile-admins-auth-open-gets 9 | ;; users that are allowed to do things on behalf of others 10 | :impersonators #{"poser" "other-impersonator"}} 11 | :cors-origins ["https?://cors.example.com"] 12 | :database {:datomic-uri "datomic:mem://cook-jobs"} 13 | :hostname "cook-scheduler-12321" 14 | :log {:file "log/cook-12321.log" 15 | :levels {"datomic.db" :warn 16 | "datomic.kv-cluster" :warn 17 | "datomic.peer" :warn 18 | :default :info}} 19 | :metrics {:jmx true 20 | :user-metrics-interval-seconds 60} 21 | :nrepl {:enabled? true 22 | :port 8888} 23 | :pools {:default "mesos-gamma"} 24 | :port 12321 25 | :rate-limit {:user-limit-per-m 1000000} 26 | :unhandled-exceptions {:log-level :error}} 27 | -------------------------------------------------------------------------------- /scheduler/bin/build-docker-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: build-docker-image.sh 4 | # Builds a docker image containing the cook scheduler. 5 | 6 | set -e 7 | 8 | SCHEDULER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. 
&& pwd )" 9 | NAME=cook-scheduler 10 | 11 | EXECUTOR_DIR="$(dirname ${SCHEDULER_DIR})/executor" 12 | EXECUTOR_NAME=cook-executor 13 | COOK_EXECUTOR_FILE=${EXECUTOR_DIR}/dist/${EXECUTOR_NAME} 14 | SCHEDULER_EXECUTOR_DIR=${SCHEDULER_DIR}/resources/public 15 | SCHEDULER_EXECUTOR_FILE=${SCHEDULER_EXECUTOR_DIR}/${EXECUTOR_NAME} 16 | 17 | ${EXECUTOR_DIR}/bin/prepare-executor.sh docker ${SCHEDULER_EXECUTOR_DIR} 18 | 19 | echo "Building docker images for ${NAME}" 20 | docker build -t ${NAME} ${SCHEDULER_DIR} 21 | -------------------------------------------------------------------------------- /scheduler/bin/help-delete-temporary-clusters: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/help-delete-temporary-clusters 4 | # Delete all temporary clusters within a zone. The sibling scripts here mark clusters they create. 5 | # This is intended to be used by other scripts and not directly. 6 | # is a gcloud project. 7 | # can be a zone. E.g., us-central1-a 8 | 9 | 10 | set -e 11 | 12 | PROJECT=$1 13 | ZONE=$2 14 | GKE_CLUSTER_OWNER=${GKE_CLUSTER_OWNER:-$USER} 15 | 16 | gcloud="gcloud --project $PROJECT" 17 | 18 | # Nuke all existing temporary clusters; don't want to keep on making more idle clusters each time you invoke this. 
19 | echo "---- Deleting any existing temporary clusters with owner $GKE_CLUSTER_OWNER" 20 | filter="resourceLabels.longevity=temporary AND resourceLabels.owner=$GKE_CLUSTER_OWNER" 21 | $gcloud container clusters list --filter "$filter" 22 | for i in $($gcloud container clusters list --filter "$filter" --format="value(name)") 23 | do 24 | echo "Deleting $i" 25 | $gcloud --quiet container clusters delete "$i" --zone "$ZONE" & 26 | done 27 | wait 28 | -------------------------------------------------------------------------------- /scheduler/bin/make-gke-test-cluster: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/make-gke-test-cluster 4 | # Configure a kubernetes cluster for running pool-based integration tests and running pools in general. 5 | # NOTE: This script labels any clusters it creates and will DELETE old clusters it created. 6 | # is a gcloud project. 7 | # can be a zone. E.g., us-central1-a 8 | # is the name of a cluster. E.g., 'test-cluster-1' 9 | 10 | # Prerequesites: 11 | # - Install gcloud (https://cloud.google.com/sdk/docs/quickstarts) 12 | # - Log in: gcloud auth login 13 | # - Install kubectl: gcloud components install kubectl 14 | 15 | set -e 16 | 17 | if [ $# -eq 0 ] 18 | then 19 | echo "You must provide the GCP project to use!" 
20 | exit 1 21 | fi 22 | 23 | PROJECT=$1 24 | ZONE=${2:-us-central1-a} 25 | CLUSTERNAME=${3:-$USER-test-cluster-$(date '+%Y%m%d-%H%M%S')} 26 | 27 | gcloud="gcloud --project $PROJECT" 28 | 29 | bin/help-delete-temporary-clusters "$PROJECT" "$ZONE" 30 | bin/help-make-cluster "$PROJECT" "$ZONE" "$CLUSTERNAME" .cook_kubeconfig_1 31 | 32 | echo "---- Showing all of the clusters we generated" 33 | $gcloud container clusters list 34 | -------------------------------------------------------------------------------- /scheduler/bin/make-gke-test-clusters: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/make-gke-test-clusters [] [] [] 4 | # Configure two kubernetes clusters for running pool-based integration tests and running pools in general. 5 | # NOTE: This script labels any clusters it creates and will DELETE old clusters it created. 6 | # is a gcloud project and defaults to $GCP_PROJECT_NAME. 7 | # can be a zone. E.g., us-central1-a 8 | # is the name of a cluster. E.g., 'test-cluster-1' 9 | 10 | # Prerequesites: 11 | # - Install gcloud (https://cloud.google.com/sdk/docs/quickstarts) 12 | # - Log in: gcloud auth login 13 | # - Install kubectl: gcloud components install kubectl 14 | 15 | set -e 16 | 17 | GKE_CLUSTER_OWNER=${GKE_CLUSTER_OWNER:-$USER} 18 | PROJECT=${1:-$GCP_PROJECT_NAME} 19 | ZONE=${2:-us-central1-a} 20 | CLUSTERNAME=${3:-$GKE_CLUSTER_OWNER-test-$(date '+%m%d-%H%M%S')} 21 | 22 | gcloud="gcloud --project $PROJECT" 23 | 24 | bin/help-delete-temporary-clusters "$PROJECT" "$ZONE" 25 | rm -f .cook_kubeconfig_1 26 | rm -f .cook_kubeconfig_2 27 | 28 | # Make 2 clusters. 
29 | bin/help-make-cluster "$PROJECT" "$ZONE" "${CLUSTERNAME}"-a .cook_kubeconfig_1 & 30 | bin/help-make-cluster "$PROJECT" "$ZONE" "${CLUSTERNAME}"-b .cook_kubeconfig_2 & 31 | wait 32 | 33 | echo "---- Showing all of the clusters we generated" 34 | filter="resourceLabels.longevity=temporary AND resourceLabels.owner=$GKE_CLUSTER_OWNER" 35 | $gcloud container clusters list --filter "$filter" 36 | -------------------------------------------------------------------------------- /scheduler/bin/priority-class-cook-workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: cook-workload 5 | value: 1000 6 | globalDefault: false 7 | description: "This priority class should be used for Cook scheduled workloads." 8 | -------------------------------------------------------------------------------- /scheduler/bin/priority-class-synthetic-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: synthetic-pod 5 | value: 1 6 | globalDefault: false 7 | description: "This priority class should be used for Cook synthetic pods (trigger autoscaling)." 8 | -------------------------------------------------------------------------------- /scheduler/bin/run-local-kubernetes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: ./bin/run-local-kubernetes.sh 4 | # Runs the cook scheduler locally. 
5 | 6 | set -e 7 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | SCHEDULER_DIR="$( dirname "${DIR}" )" 9 | 10 | # Defaults (overridable via environment) 11 | : ${COOK_DATOMIC_URI="datomic:mem://cook-jobs"} 12 | : ${COOK_FRAMEWORK_ID:=cook-framework-$(date +%s)} 13 | : ${COOK_KEYSTORE_PATH:="${SCHEDULER_DIR}/cook.p12"} 14 | : ${COOK_NREPL_PORT:=${2:-8888}} 15 | : ${COOK_PORT:=${1:-12321}} 16 | : ${COOK_SSL_PORT:=${3:-12322}} 17 | : ${MASTER_IP:="127.0.0.2"} 18 | : ${ZOOKEEPER_IP:="127.0.0.1"} 19 | : ${MESOS_NATIVE_JAVA_LIBRARY:="/usr/local/lib/libmesos.dylib"} 20 | 21 | 22 | if [ "${COOK_ZOOKEEPER_LOCAL}" = false ] ; then 23 | COOK_ZOOKEEPER="${ZOOKEEPER_IP}:2181" 24 | echo "Cook ZooKeeper configured to ${COOK_ZOOKEEPER}" 25 | else 26 | COOK_ZOOKEEPER="" 27 | COOK_ZOOKEEPER_LOCAL=true 28 | echo "Cook will use local ZooKeeper" 29 | fi 30 | 31 | if [ ! -f "${COOK_KEYSTORE_PATH}" ]; 32 | then 33 | keytool -genkeypair -keystore "${COOK_KEYSTORE_PATH}" -storetype PKCS12 -storepass cookstore -dname "CN=cook, OU=Cook Developers, O=Two Sigma Investments, L=New York, ST=New York, C=US" -keyalg RSA -keysize 2048 34 | fi 35 | 36 | echo "Creating environment variables..." 37 | export COOK_DATOMIC_URI="${COOK_DATOMIC_URI}" 38 | export COOK_FRAMEWORK_ID="${COOK_FRAMEWORK_ID}" 39 | export COOK_ONE_USER_AUTH=$(whoami) 40 | export COOK_HOSTNAME="cook-scheduler-${COOK_PORT}" 41 | export COOK_LOG_FILE="log/cook-${COOK_PORT}.log" 42 | export COOK_NREPL_PORT="${COOK_NREPL_PORT}" 43 | export COOK_PORT="${COOK_PORT}" 44 | export COOK_ZOOKEEPER="${COOK_ZOOKEEPER}" 45 | export COOK_ZOOKEEPER_LOCAL="${COOK_ZOOKEEPER_LOCAL}" 46 | export LIBPROCESS_IP="${MASTER_IP}" 47 | export MESOS_MASTER="${MASTER_IP}:5050" 48 | export MESOS_NATIVE_JAVA_LIBRARY="${MESOS_NATIVE_JAVA_LIBRARY}" 49 | export COOK_SSL_PORT="${COOK_SSL_PORT}" 50 | export COOK_KEYSTORE_PATH="${COOK_KEYSTORE_PATH}" 51 | 52 | echo "Getting GKE credentials..." 
53 | filter="resourceLabels.longevity=temporary AND resourceLabels.owner=$GKE_CLUSTER_OWNER" 54 | gcloud container clusters list --filter "$filter" 55 | i=1 56 | for cluster_zone in $(gcloud container clusters list --filter "$filter" --format="csv(name,zone)" | tail -n +2) 57 | do 58 | cluster=$(echo "$cluster_zone" | cut -d',' -f1) 59 | zone=$(echo "$cluster_zone" | cut -d',' -f2) 60 | echo "Getting credentials for cluster $cluster in zone $zone ($i)" 61 | KUBECONFIG=.cook_kubeconfig_$i gcloud container clusters get-credentials "$cluster" --zone "$zone" 62 | ((i++)) 63 | done 64 | KUBECONFIG=.cook_kubeconfig_1 kubectl get pods --namespace cook 65 | KUBECONFIG=.cook_kubeconfig_2 kubectl get pods --namespace cook 66 | 67 | echo "Starting cook..." 68 | rm -f "$COOK_LOG_FILE" 69 | lein run config-k8s.edn 70 | -------------------------------------------------------------------------------- /scheduler/bin/sample_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | uuid=$(uuidgen) 3 | curl -u vagrant:password -H "content-type: application/json" -XPOST http://localhost:12321/rawscheduler -d '{"jobs": [{"max_retries": 3, "max_runtime": 86400000, "mem": 1000, "cpus": 1.5, "uuid": "'${uuid}'", "command": "echo hello my friend", "name": "test", "priority": 0}]}' 4 | printf "\n" 5 | -------------------------------------------------------------------------------- /scheduler/bin/start-datomic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euf -o pipefail 4 | 5 | PROJECT_DIR="$(dirname $0)/.." 6 | DATOMIC_VERSION="0.9.5561.56" 7 | DATOMIC_DIR="${PROJECT_DIR}/datomic/datomic-free-${DATOMIC_VERSION}" 8 | 9 | if [ ! -d "${DATOMIC_DIR}" ]; 10 | then 11 | unzip "${PROJECT_DIR}/datomic/datomic-free-${DATOMIC_VERSION}.zip" -d "${PROJECT_DIR}/datomic" 12 | fi 13 | 14 | COOK_VERSION=$(lein print :version | tr -d '"') 15 | 16 | if [ ! 
-f "${DATOMIC_DIR}/lib/cook-${COOK_VERSION}.jar" ]; 17 | then 18 | lein uberjar 19 | # `lein print :version` would not have worked if nothing was built, so need to 20 | # get version again after building 21 | COOK_VERSION=$(lein print :version | tr -d '"') 22 | cp "${PROJECT_DIR}/target/cook-${COOK_VERSION}.jar" "${DATOMIC_DIR}/lib/" 23 | fi 24 | 25 | "${DATOMIC_DIR}/bin/transactor" $(realpath "${PROJECT_DIR}/datomic/datomic_transactor.properties") 26 | 27 | 28 | -------------------------------------------------------------------------------- /scheduler/bin/submit-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | UUID=$(uuidgen) 4 | 5 | curl -XPOST -H"Content-Type: application/json" http://localhost:12321/rawscheduler -d"{\"jobs\": [{\"uuid\": \"$UUID\", \"env\": {\"EXECUTOR_TEST_EXIT\": \"1\"}, \"executor\": \"cook\", \"mem\": 128, \"cpus\": 1, \"command\": \"echo progress: 50 test_progress && exit 0\", \"max_retries\": 1, \"container\": {\"type\": \"DOCKER\", \"docker\": {\"image\": \"python:3.5.9-stretch\", \"network\": \"HOST\", \"force-pull-image\": false}, \"volumes\": [{\"container-path\": \"/Users/paul/src/Cook/executor/dist\", \"host-path\": \"/Users/paul/src/Cook/executor/dist\"}]}}]}" 6 | -------------------------------------------------------------------------------- /scheduler/datomic/data/seed_k8s_pools.clj: -------------------------------------------------------------------------------- 1 | (ns data.seed-k8s-pools 2 | (:require [cook.datomic :as datomic] 3 | [cook.postgres :as pg] 4 | [cook.quota :as quota] 5 | [datomic.api :as d])) 6 | 7 | (def uri (second *command-line-args*)) 8 | (println "Datomic URI is" uri) 9 | 10 | (defn create-pool 11 | [conn name state] 12 | (println "Creating pool" name) 13 | @(d/transact conn [{:db/id (d/tempid :db.part/user) 14 | :pool/name name 15 | :pool/purpose "This is a pool for testing purposes" 16 | :pool/state state 17 | :pool/dru-mode 
:pool.dru-mode/default}])) 18 | 19 | (defn pools 20 | [db] 21 | (->> (d/q '[:find [?p ...] 22 | :in $ [?state ...] 23 | :where 24 | [?p :pool/state ?state]] 25 | db [:pool.state/active :pool.state/inactive]) 26 | (map (partial d/entity db)) 27 | (map d/touch))) 28 | 29 | (try 30 | (let [conn (datomic/create-connection {:settings {:mesos-datomic-uri uri}})] 31 | (->> (System/getenv "COOK_DB_TEST_PG_SCHEMA") 32 | (pg/make-database-connection-dictionary-from-env-vars) 33 | (reset! pg/saved-pg-config-dictionary)) 34 | (println "Connected to Datomic:" conn) 35 | (create-pool conn "k8s-alpha" :pool.state/active) 36 | (create-pool conn "k8s-beta" :pool.state/inactive) 37 | (create-pool conn "k8s-gamma" :pool.state/active) 38 | (create-pool conn "k8s-delta" :pool.state/inactive) 39 | (create-pool conn "k8s-quota" :pool.state/active) 40 | (quota/set-quota! conn "default" "k8s-alpha" "For quota-related testing." :cpus 8 :mem 1024) 41 | (quota/set-quota! conn "default" "k8s-gamma" "For quota-related testing." :cpus 9 :mem 2048) 42 | (println "Pools & Quotas:") 43 | (run! 
(fn [{:keys [pool/name] :as p}] 44 | (clojure.pprint/pprint p) 45 | (clojure.pprint/pprint (quota/get-quota (d/db conn) "default" name))) 46 | (pools (d/db conn))) 47 | (System/exit 0)) 48 | (catch Throwable t 49 | (println "Failed to seed pools:" t) 50 | (System/exit 1))) 51 | -------------------------------------------------------------------------------- /scheduler/datomic/datomic-free-0.9.5561.56.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/scheduler/datomic/datomic-free-0.9.5561.56.zip -------------------------------------------------------------------------------- /scheduler/datomic/datomic_transactor.properties: -------------------------------------------------------------------------------- 1 | protocol=free 2 | host=0.0.0.0 3 | port=4334 4 | 5 | memory-index-threshold=32m 6 | memory-index-max=256m 7 | object-cache-max=128m 8 | -------------------------------------------------------------------------------- /scheduler/dev-config.edn: -------------------------------------------------------------------------------- 1 | {:port 12321 2 | :hostname "localhost" 3 | ;; We'll set the user to vagrant, since that's the default for many Vagrant-based Mesos setups 4 | :authorization {:one-user "vagrant"} 5 | :database {:datomic-uri "datomic:mem://cook-jobs"} 6 | :zookeeper {:local? 
true 7 | ;:local-port 3291 ; Uncomment to change the default port 8 | } 9 | :scheduler {:offer-incubate-ms 15000 10 | :mea-culpa-failure-limit {:default 5 11 | :mesos-master-disconnected 8 12 | ; -1 means no limit 13 | :preempted-by-rebalancer -1} 14 | :task-constraints {:timeout-hours 1 15 | :timeout-interval-minutes 1 16 | :memory-gb 48 17 | :retry-limit 15 18 | :cpus 6}} 19 | :rebalancer {:dru-scale 1} 20 | :mesos {:master "zk://localhost:2181/mesos" ; Assuming Mesos is configured to use Zookeeper and is running locally 21 | :failover-timeout-ms nil ; When we close the instance of Cook, all its tasks are killed by Mesos 22 | :leader-path "/cook-scheduler"} 23 | :unhandled-exceptions {:log-level :error} 24 | :metrics {:jmx true} 25 | :nrepl {:enabled? true 26 | :port 8888} 27 | :log {:file "log/cook.log" 28 | :levels {"datomic.db" :warn 29 | "datomic.peer" :warn 30 | "datomic.kv-cluster" :warn 31 | :default :info}}} 32 | -------------------------------------------------------------------------------- /scheduler/docker/run-cook.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATOMIC_PROPERTIES_FILE=/opt/cook/datomic/datomic_transactor.properties 4 | 5 | echo "alt-host=$(hostname -i | cut -d' ' -f2)" >> ${DATOMIC_PROPERTIES_FILE} 6 | /opt/cook/datomic-free-0.9.5561.56/bin/transactor ${DATOMIC_PROPERTIES_FILE} & 7 | echo "Seeding test data..." 8 | # Needed because seeding pools uses codepaths that access the database. 
9 | export COOK_DB_TEST_PG_DB="cook_local" 10 | export COOK_DB_TEST_PG_USER="cook_scheduler" 11 | export COOK_DB_TEST_PG_SERVER="cook-postgres" 12 | export COOK_DB_TEST_PG_SCHEMA="cook_local" 13 | lein exec -p /opt/cook/datomic/data/seed_k8s_pools.clj ${COOK_DATOMIC_URI} 14 | lein exec -p /opt/cook/datomic/data/seed_running_jobs.clj ${COOK_DATOMIC_URI} 15 | lein with-profiles +docker run $1 16 | -------------------------------------------------------------------------------- /scheduler/docs/clj-http-async-pool.md: -------------------------------------------------------------------------------- 1 | clj-http-async-pool 2 | =================== 3 | 4 | pooling middleware for async clj-http requests 5 | 6 | Usage 7 | ----- 8 | 9 | (use '[clj-http-async-pool.router :as http-router]) 10 | (def router (http-router/make-router {:hosts #{"www.random.org:80"}})) 11 | (use '[clj-http-async-pool.client :as http]) 12 | (http/get router "https://www.random.org/sequences/?min=1&max=42&col=1&format=plain") 13 | 14 | © Two Sigma Open Source, LLC 15 | -------------------------------------------------------------------------------- /scheduler/docs/dev-getting-started.md: -------------------------------------------------------------------------------- 1 | # Setting up your Cook dev environment 2 | 3 | This document tells you how to set up a Cook dev environment from 4 | scratch. We have to install Clojure itself, Datomic, Docker, and Mesos. 5 | 6 | Prerequisites 7 | ============= 8 | 9 | Before beginning, you should already have working installations of Clojure and [Leiningen](https://leiningen.org/). 10 | Refer to those projects' getting started guides for information on how to set 11 | them up. 
12 | 13 | 14 | Installing Cook-specific Infrastructure 15 | ======================================== 16 | 17 | 18 | Docker 19 | ----- 20 | 21 | Install docker by following the instructions on: 22 | 23 | https://docs.docker.com/engine/installation/linux/ubuntulinux/ 24 | 25 | There is install docs for all common OSs. 26 | 27 | Minimesos 28 | ----- 29 | 30 | Then, install minimesos by following the instructions at 31 | http://minimesos.readthedocs.io/en/latest/ . 32 | 33 | 34 | Once minimesos is installed, you can download and run minimesos itself: 35 | 36 | 37 | ``` 38 | mkdir minimesos 39 | cd minimesos 40 | 41 | minimesos init 42 | minimesos up --num-agents 2 43 | ``` 44 | 45 | Big Mesos 46 | --------- 47 | 48 | Even if you are using minimesos, you still have to build regular Mesos 49 | to get the `libmesos.so` library (called `libmesos.dylib` on Mac). 50 | 51 | You can either install it directly on your machine or use a docker container with 52 | mesos installed. Here we will only talk about using a docker container. 53 | If you instead want to install mesos on your machine, you can follow the docs here: 54 | http://mesos.apache.org/gettingstarted/ 55 | 56 | The following repo contains a DockerFile that will set up Cook, 57 | use this as a starting point to get set up: 58 | https://github.com/wyegelwel/cook-docker 59 | 60 | 61 | Command Line Usage 62 | ================== 63 | 64 | To build and run the project at the command line, copy 65 | `$COOK_DIR/scheduler/dev-config.edn` and edit the copy so that the Mesos master ZooKeeper URL matches 66 | the one returned by `minimesos info`. 
67 | 68 | Then run the following, replacing `$MESOS_DIR` with the actual path to your local 69 | Mesos build: 70 | 71 | 72 | ``` 73 | cd $COOK_DIR/scheduler 74 | lein uberjar 75 | MESOS_NATIVE_JAVA_LIBRARY=$MESOS_DIR/build/src/.libs/libmesos.so lein run ./local-dev-config.edn 76 | ``` 77 | 78 | Test that the server is running properly with: 79 | 80 | ``` 81 | curl http://localhost:12321/rawscheduler 82 | ``` 83 | 84 | If you get a reply like `"must supply at least one job query param"`, that means Cook is running. 85 | 86 | 87 | Interactive development 88 | ======================= 89 | 90 | The dev config will open a nrepl port on the running cook server. 91 | You can connect to this port and then develop, eval and test on the running server. 92 | We have found this greatly speeds up development and is just generally pleasant. 93 | -------------------------------------------------------------------------------- /scheduler/docs/kubernetes-state.dot: -------------------------------------------------------------------------------- 1 | # A graph showing valid transitions from expected states to expected states. Edges are annotated with 2 | # which kubernetes states the system may be in when it makes the transition. 
3 | 4 | digraph g { 5 | Starting -> Starting [label=":waiting\n:missing"] 6 | Starting -> Running [label=":running"] 7 | // (Starting, missing) -> Completed happens for some failed pod submissions 8 | Starting -> Completed [label=":succeeded\n:failed\n:unknown\n:deleting"] 9 | 10 | Running -> Running [label=":running"] 11 | Running -> Completed [label=":waiting\n:succeeded\n:failed\n:unknown\n:missing\n:deleting"] 12 | 13 | Completed -> Completed [label=":waiting\n:running\n:unknown\n:succeeded\n:failed"] 14 | Completed -> Missing [label=":missing\n:deleting"] 15 | 16 | Killed -> Killed [label=":waiting\n:running\n:unknown\n"] 17 | Killed -> Completed [label=":succeeded\n:failed\n:missing\n:deleting"] 18 | 19 | Missing [peripheries=2] 20 | Missing -> Missing [label=":waiting\n:running\n:succeeded\n:failed\n:unknown\n:missing\n:deleting"] 21 | } 22 | -------------------------------------------------------------------------------- /scheduler/docs/make-kubernetes-namespace.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "v1", 3 | "kind": "Namespace", 4 | "metadata": { 5 | "name": "cook", 6 | "labels": { 7 | "name": "cook" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /scheduler/docs/metatransactions.md: -------------------------------------------------------------------------------- 1 | # metatransaction 2 | 3 | ## What is a metatransaction? 4 | 5 | A metatransaction is a way to link datomic transactions in a single logical transaction. Metatransactions have simple symantics, link a datomic transaction to a metatransaction and commit a metatransaction. The library also supplies a [db filter](https://support.cognitect.com/entries/25976096-Filtering-Databases) to remove transactions that have not had their metatransaction committed. 6 | 7 | ## Why use metatransactions? 
8 | 9 | Metatransactions allow you to craft smaller transactions and handle streaming data while maintaining the semantics of a transaction. 10 | 11 | ## Usage 12 | 13 | Simple example where a user can submit any number of jobs, one at a time to the server and once all the jobs are sent, a end message is sent to the server. Here we can use metatransactions to logically transact our jobs and only acknowledge them once they are committed.: 14 | 15 | ```Clojure 16 | 17 | (require '[datomic.api :as d] 18 | '[metatransaction.core :as mt]) 19 | 20 | (def conn (d/connect uri)) 21 | 22 | (mt/setup-metatransaction conn) 23 | 24 | (def job1 (d/squuid)) ; We suggest using squuids to improve indexing 25 | 26 | (d/transact conn [[:metatransaction/include-in job1] 27 | {:db/id (d/tempid :db.part/user) 28 | :job/id 1 29 | :job/uuid job1}]) 30 | (d/transact conn [[:metatransaction/commit job1]]) 31 | 32 | (def job2 (d/squuid)) 33 | 34 | (d/transact conn [[:metatransaction/include-in job2] 35 | {:db/id (d/tempid :db.part/user) 36 | :job/id 1 37 | :job/uuid job2}]) 38 | (d/transact conn [[:metatransaction/include-in job2] 39 | {:db/id (d/tempid :db.part/user) 40 | :job/id 2 41 | :job/uuid job2}]) 42 | 43 | (d/q '[:find ?job-id ?job-uuid 44 | :where 45 | [?e :job/id ?job-id] 46 | [?e :job/uuid ?job-uuid]] 47 | (d/db conn)) 48 | ;; Will print #{[1 job1] [1 job2] [2 job2]} 49 | 50 | 51 | 52 | (d/q '[:find ?job-id ?job-uuid 53 | :where 54 | [?e :job/id ?job-id] 55 | [?e :job/uuid ?job-uuid]] 56 | (mt/db conn)) 57 | 58 | ; Will print #{[1 job1]} 59 | ``` 60 | 61 | ## License 62 | 63 | © Two Sigma Open Source, LLC 64 | -------------------------------------------------------------------------------- /scheduler/docs/optimizer.md: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ========= 3 | 4 | The optimizer is intended to provide a longer term, holistic plan for the cluster that other components in Cook can consume to inform their 
operation. 5 | Cook will provide a no-op implementation of an optimizer and allow for plugging in different implementations. 6 | 7 | The optimizer is provided with the current queue, the jobs that are running, the offers that are available and a pluggable feed of hosts that can be purchased. 8 | There are plans to support more plug-ins such as expected demand in the future. 9 | With these inputs, the optimizer produces a 'schedule' of suggestions of what hosts to purchase and matches of jobs and hosts at different time horizons. 10 | 11 | There are plans to have the schedule be fed to the matcher so that it may treat the suggestions of the optimizer as soft constraints. 12 | 13 | The specification of pluggable pieces can be found in [optimizer.clj](scheduler/src/cook/mesos/optimizer.clj). 14 | -------------------------------------------------------------------------------- /scheduler/docs/reason-code: -------------------------------------------------------------------------------- 1 | 01xxx: Normal 2 | 01000: Normal exit 3 | 01001: Killed by user 4 | 01002: Preempted by rebalancer 5 | 01003: REASON_CONTAINER_PREEMPTED 6 | 01004: REASON_TASK_KILLED_DURING_LAUNCH 7 | 01005: Running 8 | 01006: Scheduling failed on host 9 | 01007: Container initialization timed out 10 | 01008: Killed externally 11 | 01009: Container readiness timed out 12 | 01010: Kubernetes pod submission API error 13 | 14 | 02xxx: Job Misconfiguration 15 | 02000: REASON_CONTAINER_LIMITATION 16 | 02001: REASON_CONTAINER_LIMITATION_DISK 17 | 02002: REASON_CONTAINER_LIMITATION_MEMORY 18 | 02003: Max runtime exceeded 19 | 02004: Task was a straggler 20 | 21 | 03xxx: Cook Error 22 | 03000: REASON_RECONCILIATION 23 | 03001: REASON_INVALID_FRAMEWORKID 24 | 03002: REASON_INVALID_OFFERS 25 | 03003: REASON_RESOURCES_UNKNOWN 26 | 03004: REASON_TASK_INVALID 27 | 03005: REASON_TASK_UNAUTHORIZED 28 | 03006: REASON_TASK_UNKNOWN 29 | 03007: REASON_SLAVE_UNKNOWN 30 | 03008: Could not reconstruct pod 31 | 32 | 04xxx: 
Mesos Slave Error 33 | 04000: REASON_SLAVE_REMOVED 34 | 04001: REASON_SLAVE_RESTARTED 35 | 04002: REASON_GC_ERROR 36 | 04003: REASON_CONTAINER_LAUNCH_FAILED 37 | 04004: REASON_CONTAINER_UPDATE_FAILED 38 | 04005: REASON_SLAVE_DISCONNECTED 39 | 04006: Cook heartbeat lost 40 | 41 | 05xxx: Mesos Master Error 42 | 05000: REASON_FRAMEWORK_REMOVED 43 | 05001: REASON_MASTER_DISCONNECTED 44 | 45 | 06xxx: Executor Error 46 | 06000: REASON_EXECUTOR_REGISTRATION_TIMEOUT 47 | 06001: REASON_EXECUTOR_REREGISTRATION_TIMEOUT 48 | 06002: REASON_EXECUTOR_UNREGISTERED 49 | 50 | 99xxx: General Error 51 | 99000: unknown reason 52 | 99001: unknown mesos reason 53 | 99002: REASON_EXECUTOR_TERMINATED 54 | 99003: Exited non-zero 55 | -------------------------------------------------------------------------------- /scheduler/example-prod-config.edn: -------------------------------------------------------------------------------- 1 | {:port 12321 2 | :hostname "cook.example.com" 3 | :authorization {:http-basic true} 4 | :database {:datomic-uri "datomic:free://example.com:4334/cook-jobs"} 5 | :authorization-config { 6 | ;; What function should be used to perform user authorization? 7 | ;; See the docstring in cook.rest.authorization for details. 8 | :authorization-fn cook.rest.authorization/configfile-admins-auth 9 | 10 | 11 | ;; These users have admin privileges when using 12 | ;; configfile-admins-auth -- that is, they can view and modify other 13 | ;; users' jobs. 
14 | :admins #{"admin" "other-admin"}} 15 | :zookeeper {:connection "zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181/cook"} 16 | :scheduler {:offer-incubate-ms 15000 17 | :mea-culpa-failure-limit 5 18 | :task-constraints {:timeout-hours 24 19 | :timeout-interval-minutes 10 20 | :memory-gb 96 21 | :retry-limit 15 22 | :cpus 20}} 23 | :executor {:command "./cook-executor" 24 | :uri {:cache true 25 | :executable true 26 | :extract false 27 | :value "http://example.com:12321/resources/cook-executor"}} 28 | :mesos {:master "zk://zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181/cook" 29 | :failover-timeout-ms 1209600000 30 | :leader-path "/cook-scheduler"} 31 | :unhandled-exceptions {:log-level :error 32 | :email {:to ["admin@example.com"] 33 | :from "cook@example.com" 34 | :subject "Unhandled exception in cook"}} 35 | :metrics {:jmx true} 36 | :nrepl {:enabled? true 37 | :port 8888} 38 | :log {:file "log/cook.log" 39 | :levels {"datomic.db" :warn 40 | "datomic.peer" :warn 41 | "datomic.kv-cluster" :warn 42 | :default :info}}} 43 | -------------------------------------------------------------------------------- /scheduler/java/com/twosigma/cook/kubernetes/FinalizerHelper.java: -------------------------------------------------------------------------------- 1 | package com.twosigma.cook.kubernetes; 2 | 3 | import io.kubernetes.client.custom.V1Patch; 4 | import io.kubernetes.client.openapi.ApiClient; 5 | import io.kubernetes.client.openapi.ApiException; 6 | import io.kubernetes.client.openapi.apis.CoreV1Api; 7 | import io.kubernetes.client.openapi.models.V1Pod; 8 | import io.kubernetes.client.util.PatchUtils; 9 | import org.joda.time.DateTime; 10 | 11 | import java.util.List; 12 | 13 | public class FinalizerHelper { 14 | /** A finalizer that is attached to a pod to ensure that it is not GC'ed by K8s before cook 15 | * has had a chance to collect the completion result (success or failed) */ 16 | static public final String collectResultsFinalizer 
= "cook/prevent-pod-gc"; 17 | 18 | /** Remove the collectResultsFinalizer from a pod if it exists on a pod and the pod is morked for 19 | * deletion. */ 20 | static public void removeFinalizer(ApiClient apiClient, V1Pod pod) throws ApiException { 21 | CoreV1Api api = new CoreV1Api(apiClient); 22 | 23 | DateTime deletionTimestamp = pod.getMetadata().getDeletionTimestamp(); 24 | if (deletionTimestamp != null) { 25 | List finalizers = pod.getMetadata().getFinalizers(); 26 | if (finalizers != null) { 27 | for (int ii = 0; ii < finalizers.size(); ii++) { 28 | if (collectResultsFinalizer.equals(finalizers.get(ii))) { 29 | String jsonPatchStr = "[{\"op\": \"remove\", \"path\": \"/metadata/finalizers/" + ii + "\"}]"; 30 | String podName = pod.getMetadata().getName(); 31 | String namespaceName = pod.getMetadata().getNamespace(); 32 | PatchUtils.patch( 33 | V1Pod.class, 34 | () -> 35 | api.patchNamespacedPodCall( 36 | podName, 37 | namespaceName, 38 | new V1Patch(jsonPatchStr), 39 | null, 40 | null, 41 | null, // field-manager is optional 42 | null, 43 | null), 44 | V1Patch.PATCH_FORMAT_JSON_PATCH, 45 | apiClient); 46 | return; // Early abort if we've found the finalizer. 
47 | } 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /scheduler/java/com/twosigma/cook/kubernetes/WatchHelper.java: -------------------------------------------------------------------------------- 1 | package com.twosigma.cook.kubernetes; 2 | 3 | import com.google.common.reflect.TypeToken; 4 | import io.kubernetes.client.openapi.ApiClient; 5 | import io.kubernetes.client.openapi.ApiException; 6 | import io.kubernetes.client.openapi.apis.CoreV1Api; 7 | import io.kubernetes.client.openapi.models.CoreV1Event; 8 | import io.kubernetes.client.openapi.models.V1Node; 9 | import io.kubernetes.client.openapi.models.V1Pod; 10 | import io.kubernetes.client.util.Watch; 11 | 12 | public class WatchHelper { 13 | 14 | public static Watch createPodWatch(ApiClient apiClient, String resourceVersion) throws ApiException { 15 | CoreV1Api api = new CoreV1Api(apiClient); 16 | return Watch.createWatch(apiClient, 17 | api.listPodForAllNamespacesCall(null, null, null, null, null, null, 18 | resourceVersion, null, null, true, null), 19 | new TypeToken>() {}.getType()); 20 | } 21 | 22 | public static Watch createNodeWatch(ApiClient apiClient, String resourceVersion) throws ApiException { 23 | CoreV1Api api = new CoreV1Api(apiClient); 24 | return Watch.createWatch(apiClient, 25 | api.listNodeCall(null, null, null, null, null, null, resourceVersion, null, null, true, null), 26 | new TypeToken>() {}.getType()); 27 | } 28 | 29 | public static Watch createEventWatch(ApiClient apiClient, String resourceVersion) throws ApiException { 30 | CoreV1Api api = new CoreV1Api(apiClient); 31 | return Watch.createWatch(apiClient, 32 | api.listEventForAllNamespacesCall(null, null, null, null, null, 33 | null, resourceVersion, null, null, true, null), 34 | new TypeToken>() {}.getType()); 35 | } 36 | } -------------------------------------------------------------------------------- 
/scheduler/liquibase/changelog/com/twosigma/cook/changelogs/setup.postgresql.sql: -------------------------------------------------------------------------------- 1 | --liquibase formatted sql 2 | 3 | -- Initialize a cook database from scratch --- creating the schemas and such. 4 | -- Assumes we already have an appropriately configured postgresql database and 5 | -- have psql connected to it. 6 | 7 | -- If you get a crazy error where 'No schema has been selected to 8 | -- create in' when running the first CREATE TABLE. It can be caused by there being a capital 9 | -- letter in cook_schema. Schema names are lowercased when created, but case-sensitive when 10 | -- in the search path. -------------------------------------------------------------------------------- /scheduler/postgresql/bin/make-launch-postgres-docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### 4 | ### Reset any existing postgres docker container and make and configure one afresh. 5 | ### 6 | ### Sets the password to $PGPASSWORD 7 | ### 8 | 9 | if [[ a"$PGPASSWORD" == a ]]; 10 | then 11 | echo "Need to set PGPASSWORD." 12 | exit 1 13 | fi 14 | 15 | 16 | ## Copied from run-docker.sh 17 | echo "About to: Setup and check docker networking" 18 | if [ -z "$(docker network ls -q -f name=cook_nw)" ]; 19 | then 20 | # Using a separate network allows us to access hosts by name (cook-scheduler-12321) 21 | # instead of IP address which simplifies configuration 22 | echo "Creating cook_nw network" 23 | docker network create -d bridge --subnet 172.25.0.0/16 cook_nw 24 | fi 25 | 26 | 27 | echo "#### Flushing existing docker containers `date`" 28 | 29 | # Flush any existing containers. 30 | docker kill cook-postgres || true 31 | docker container rm cook-postgres || true 32 | 33 | echo "#### Launching database `date`" 34 | 35 | # This launches the database. 
We give it a hostname of cook-postgres so that we can connect to 36 | # the container using psql -h ...., later. 37 | docker run --name cook-postgres --hostname cook-postgres --publish=5432:5432 --rm --network cook_nw -e POSTGRES_PASSWORD="${PGPASSWORD}" -d postgres:13 38 | 39 | echo "#### Pausing for the DB to restart before setting it up." 40 | sleep 4 41 | 42 | export COOK_SCHEMA=cook_local 43 | 44 | # 45 | # Finish postgres setup in the container. 46 | # 47 | 48 | # Create the initial cook account and database. 49 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" 50 | ${DIR}/setup-database.sh 51 | 52 | # See the README.txt to see how to access this interactively. 53 | -------------------------------------------------------------------------------- /scheduler/postgresql/bin/setup-database.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" 4 | 5 | # Create the initial cook account and database. 6 | echo "#### Initializing new account and database." 7 | psql --set=cook_user_password="$PGPASSWORD" -h 127.0.0.1 -U postgres -f ${DIR}/../sql/docker_init_new_database.sql 8 | 9 | echo "#### Running script to create convenience SQL schema cook_local" 10 | export COOK_DB_TEST_PG_DATABASE=cook_local 11 | export COOK_DB_TEST_PG_USER=cook_scheduler 12 | export COOK_DB_TEST_PG_SERVER=cook-postgres 13 | ${DIR}/setup-new-schema.sh ${COOK_SCHEMA} 14 | 15 | echo "#### Setting up rows for opensource integration tests." 
16 | psql --set=cook_schema="${COOK_SCHEMA}" -h 127.0.0.1 -U cook_scheduler -d cook_local -f ${DIR}/../sql/insert_rows_for_opensource_integration_tests.sql 17 | 18 | -------------------------------------------------------------------------------- /scheduler/postgresql/bin/setup-new-schema.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" 4 | 5 | COOK_SCHEMA=${1} 6 | 7 | echo "### Started script to create schema '${COOK_SCHEMA}' out of directory $DIR/../sql" 8 | 9 | # Liquibase setup: 10 | echo "## Running PSQL to create schema" 11 | psql --set=cook_schema="${COOK_SCHEMA}" -h 127.0.0.1 -U cook_scheduler -d cook_local -f ${DIR}/../sql/init_cook_database.sql 12 | 13 | echo "## Liquibase setup." 14 | LIQUIBASE="${DIR}/../../liquibase" 15 | 16 | export COOK_DB_TEST_PG_DATABASE=cook_local 17 | export COOK_DB_TEST_PG_USER=cook_scheduler 18 | export COOK_DB_TEST_PG_SERVER=cook-postgres 19 | 20 | PG_JDBC_URL="jdbc:postgresql://${COOK_DB_TEST_PG_SERVER}/${COOK_DB_TEST_PG_DATABASE}?user=${COOK_DB_TEST_PG_USER}&password=${PGPASSWORD}&currentSchema=${COOK_SCHEMA}" 21 | 22 | # Note that --changeLogFile is relative to /liquibase in the container, so comes from the -v volume mountpoint, and MUST be a relative path.
23 | docker run --network cook_nw --rm -v ${LIQUIBASE}/changelog:/liquibase/changelog liquibase/liquibase:4.6 --changeLogFile=./changelog/com/twosigma/cook/changelogs/setup.postgresql.sql --url ${PG_JDBC_URL} --liquibase-schema-name=${COOK_SCHEMA} update 24 | 25 | echo "### Finished script creating schema ${COOK_SCHEMA}" 26 | -------------------------------------------------------------------------------- /scheduler/postgresql/bin/vagrant-setup-database.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" 4 | 5 | # Create the initial cook account and database. 6 | echo "#### Initializing new account and database." 7 | sudo -u postgres psql --set=cook_user_password="$PGPASSWORD" -f ${DIR}/../sql/docker_init_new_database.sql 8 | 9 | echo "#### Running script to create convenience SQL schema cook_local" 10 | ${DIR}/vagrant-setup-new-schema.sh ${COOK_SCHEMA} 11 | 12 | echo "#### Setting up rows for opensource integration tests." 13 | psql --set=cook_schema="${COOK_SCHEMA}" -h 127.0.0.1 -U cook_scheduler -d cook_local -f ${DIR}/../sql/insert_rows_for_opensource_integration_tests.sql 14 | 15 | -------------------------------------------------------------------------------- /scheduler/postgresql/bin/vagrant-setup-new-schema.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" 4 | 5 | COOK_SCHEMA=${1} 6 | 7 | echo "### Started script to create schema '${COOK_SCHEMA}' out of directory $DIR/../sql" 8 | 9 | # Liquibase setup: 10 | echo "## Running PSQL to create schema" 11 | psql --set=cook_schema="${COOK_SCHEMA}" -h 127.0.0.1 -U cook_scheduler -d cook_local -f ${DIR}/../sql/init_cook_database.sql 12 | 13 | echo "## Liquibase setup." 
14 | 15 | PG_JDBC_URL="jdbc:postgresql://${COOK_DB_TEST_PG_SERVER}/${COOK_DB_TEST_PG_DATABASE}?user=${COOK_DB_TEST_PG_USER}&password=${PGPASSWORD}&currentSchema=${COOK_SCHEMA}" 16 | 17 | # Note that liquibase must run from scheduler/liquibase and --changeLogFile is relative to scheduler/liquibase and MUST be a relative path. 18 | cd ${DIR}/../../liquibase 19 | liquibase --classpath=/usr/share/java/postgresql.jar --changeLogFile=changelog/com/twosigma/cook/changelogs/setup.postgresql.sql --url ${PG_JDBC_URL} --liquibaseSchemaName=${COOK_SCHEMA} update 20 | 21 | echo "### Finished script creating schema ${COOK_SCHEMA}" 22 | -------------------------------------------------------------------------------- /scheduler/postgresql/sql/docker_init_new_database.sql: -------------------------------------------------------------------------------- 1 | -- Create a cook scheduler database from a brand new database, including 2 | -- creating the initial user. Intended to be run for docker setup in opensource only. 3 | 4 | -- When in docker-land, we use the database username cook_scheduler 5 | DROP DATABASE IF EXISTS cook_local; 6 | ALTER DEFAULT PRIVILEGES REVOKE ALL ON TABLES FROM cook_scheduler; 7 | ALTER DEFAULT PRIVILEGES REVOKE ALL ON SCHEMAS FROM cook_scheduler; 8 | DROP ROLE IF EXISTS cook_scheduler; 9 | CREATE ROLE cook_scheduler with password :'cook_user_password' LOGIN; 10 | CREATE DATABASE cook_local WITH owner cook_scheduler; 11 | 12 | -- Ensure that all schemas on this database are writeable by cook_scheduler user. 13 | ALTER DEFAULT PRIVILEGES GRANT ALL ON SCHEMAS TO cook_scheduler; 14 | ALTER DEFAULT PRIVILEGES GRANT ALL ON TABLES TO cook_scheduler; 15 | -------------------------------------------------------------------------------- /scheduler/postgresql/sql/init_cook_database.sql: -------------------------------------------------------------------------------- 1 | -- Initialize a cook database from scratch --- creating the schemas and such.
2 | -- Assumes we already have an appropriately configured postgresql database and 3 | -- have psql connected to it. 4 | 5 | BEGIN TRANSACTION; 6 | -- Always run this in the transaction so that if the set schema fails for any reason, we abort instead of possibly writing to the wrong schema's tables. 7 | CREATE SCHEMA :cook_schema; 8 | SET SCHEMA :'cook_schema'; 9 | 10 | -- The semicolon is required: psql only sends the query buffer on ';' (or \g), and the 11 | -- following \dt meta-command would otherwise leave this COMMIT unsent, so the CREATE 12 | -- SCHEMA above would be rolled back when psql disconnects. 13 | COMMIT; 14 | 15 | -- Just show the tables at the end. 16 | \dt :'cook_schema'. 17 | -------------------------------------------------------------------------------- /scheduler/postgresql/sql/insert_rows_for_opensource_integration_tests.sql: -------------------------------------------------------------------------------- 1 | -- Insert some rows for development and running tests in open source, including initial quotas and pools 2 | -- for the integration tests. 3 | 4 | --- DO NOT RUN IN PRODUCTION. 5 | begin transaction; 6 | -- Always run this in the transaction so that if the set schema fails for any reason, we abort instead of possibly writing to the wrong schema's tables. 7 | SET SCHEMA :'cook_schema'; 8 | 9 | 10 | insert into pools VALUES ('k8s-alpha',true,''); 11 | insert into pools VALUES ('k8s-beta',false,''); 12 | insert into pools VALUES ('k8s-gamma',true,''); 13 | insert into pools VALUES ('k8s-delta',false,''); 14 | 15 | insert into resource_limits VALUES ('quota','k8s-alpha','default','mem',1000000, ''); 16 | insert into resource_limits VALUES ('quota','k8s-alpha','default','cpus',1000000, ''); 17 | insert into resource_limits VALUES ('quota','k8s-beta','default','mem',1000000, ''); 18 | insert into resource_limits VALUES ('quota','k8s-beta','default','cpus',1000000, ''); 19 | end transaction; 20 | -------------------------------------------------------------------------------- /scheduler/postgresql/sql/reset_cook_database.sql: -------------------------------------------------------------------------------- 1 | -- WIPE THE DATABASE!
2 | -- Assumes we already have an appropriately configured postgresql database and 3 | -- have psql connected to it. 4 | 5 | -- Drops the schema of a cook database so it can be recreated. 6 | DROP SCHEMA IF EXISTS :cook_schema CASCADE; 7 | -------------------------------------------------------------------------------- /scheduler/postgresql/sql/reset_init_cook_database.sql: -------------------------------------------------------------------------------- 1 | -- WIPES THE DATABASE 2 | -- Reinitialize a cook database, wiping all of the contents first. 3 | -- Assumes we already have an appropriately configured postgresql database and 4 | -- have psql connected to it. 5 | 6 | \ir reset_cook_database.sql 7 | \ir init_cook_database.sql 8 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg* 2 | *.egg-info 3 | *.ipynb_checkpoints* 4 | *pyc 5 | __pycache__ 6 | venv/ 7 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/README.md: -------------------------------------------------------------------------------- 1 | # Cook analysis 2 | 3 | Tools to analyze a trace of tasks run in Cook. 4 | 5 | The python notebook included here provides samples for how to use the functions. 6 | 7 | 8 | ## Credits 9 | 10 | This package was created with (Cookiecutter)[https://github.com/audreyr/cookiecutter] and the (audreyr/cookiecutter-pypackage)[https://github.com/audreyr/cookiecutter-pypackage] project template. 
11 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip==8.1.2 2 | bumpversion==0.5.3 3 | wheel==0.29.0 4 | watchdog==0.8.3 5 | flake8==2.6.0 6 | coverage==4.1 7 | Sphinx==1.4.8 8 | cryptography==1.7 9 | PyYAML==5.1 10 | pandas>=0.19.2 11 | matplotlib==2.0.0 12 | numpy==1.12.1 13 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:cook_integration/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [nosetests] 21 | processes=10 22 | process-timeout=900 -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup 5 | 6 | requirements = [ 7 | 'pandas', 8 | 'matplotlib', 9 | 'numpy' 10 | ] 11 | 12 | test_requirements = [] 13 | 14 | setup( 15 | name='cook_analysis', 16 | version='0.1.0', 17 | description="Functions to analyze trace output from cook scheduler", 18 | url='https://github.com/twosigma/Cook', 19 | include_package_data=True, 20 | install_requires=requirements, 21 | license="Apache Software License 2.0", 22 | zip_safe=False, 23 | keywords='cook_analysis', 24 | classifiers=[ 25 | 'Development Status :: 2 - Pre-Alpha', 26 | 'Intended 
Audience :: Developers', 27 | 'License :: OSI Approved :: Apache Software License', 28 | 'Natural Language :: English', 29 | "Programming Language :: Python :: 2", 30 | 'Programming Language :: Python :: 2.6', 31 | 'Programming Language :: Python :: 2.7', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3.3', 34 | 'Programming Language :: Python :: 3.4', 35 | 'Programming Language :: Python :: 3.5', 36 | ], 37 | test_suite='tests', 38 | tests_require=test_requirements, 39 | setup_requires=[] 40 | ) 41 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /scheduler/simulator_files/analysis/tests/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG) 4 | -------------------------------------------------------------------------------- /scheduler/simulator_files/example-config.edn: -------------------------------------------------------------------------------- 1 | {:shares [{:user "default" :mem 60000.0 :cpus 600.0 :gpus 1.0}] 2 | :cycle-step-ms 30000 3 | :scheduler-config {:rebalancer-config {:max-preemption 10.0} 4 | :fenzo-config {:fenzo-max-jobs-considered 200}}} 5 | -------------------------------------------------------------------------------- /scheduler/simulator_files/example-hosts.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "hostname" : "0", 3 | "attributes" : { }, 4 | "resources" : { 5 | "cpus" : { 6 | "*" : 10 7 | }, 8 | "mem" : { 9 | "*" : 10000 10 | }, 11 | "ports" : { 12 | "*" : [ { 13 | "begin" : 1, 14 | "end" : 100 15 | } ] 16 
| } 17 | }, 18 | "slave-id" : "a05a4ac2-7eb2-40a1-8259-111b961874c1" 19 | }, { 20 | "hostname" : "1", 21 | "attributes" : { }, 22 | "resources" : { 23 | "cpus" : { 24 | "*" : 10 25 | }, 26 | "mem" : { 27 | "*" : 10000 28 | }, 29 | "ports" : { 30 | "*" : [ { 31 | "begin" : 1, 32 | "end" : 100 33 | } ] 34 | } 35 | }, 36 | "slave-id" : "eba83456-8596-44c2-9de5-fb29cfaf1647" 37 | }, { 38 | "hostname" : "2", 39 | "attributes" : { }, 40 | "resources" : { 41 | "cpus" : { 42 | "*" : 10 43 | }, 44 | "mem" : { 45 | "*" : 10000 46 | }, 47 | "ports" : { 48 | "*" : [ { 49 | "begin" : 1, 50 | "end" : 100 51 | } ] 52 | } 53 | }, 54 | "slave-id" : "4696a9c8-fc9f-46b4-8a83-23b384f3f616" 55 | }, { 56 | "hostname" : "3", 57 | "attributes" : { }, 58 | "resources" : { 59 | "cpus" : { 60 | "*" : 10 61 | }, 62 | "mem" : { 63 | "*" : 10000 64 | }, 65 | "ports" : { 66 | "*" : [ { 67 | "begin" : 1, 68 | "end" : 100 69 | } ] 70 | } 71 | }, 72 | "slave-id" : "a45be252-35f7-434a-8ec3-1e85265213ac" 73 | }, { 74 | "hostname" : "4", 75 | "attributes" : { }, 76 | "resources" : { 77 | "cpus" : { 78 | "*" : 10 79 | }, 80 | "mem" : { 81 | "*" : 10000 82 | }, 83 | "ports" : { 84 | "*" : [ { 85 | "begin" : 1, 86 | "end" : 100 87 | } ] 88 | } 89 | }, 90 | "slave-id" : "67194c23-61d6-4901-87a8-38d84af57f95" 91 | } ] -------------------------------------------------------------------------------- /scheduler/src/cook/cached_queries.clj: -------------------------------------------------------------------------------- 1 | (ns cook.cached-queries 2 | (:require [cook.cache :as ccache] 3 | [cook.caches :as caches] 4 | [cook.config :as config] 5 | [cook.datomic :as datomic] 6 | [datomic.api :as d :refer [q]])) 7 | 8 | (let [miss-fn 9 | (fn [{:keys [job/pool]}] 10 | (or (:pool/name pool) 11 | (config/default-pool) 12 | "no-pool"))] 13 | (defn job->pool-name 14 | "Return the pool name of the job. Guaranteed non nil." 15 | [job] 16 | (caches/lookup-cache-datomic-entity! 
caches/job-ent->pool-cache miss-fn job))) 17 | 18 | (defn job-ent->user 19 | "Given a job entity, return the user the job runs as." 20 | [job-ent] 21 | (caches/lookup-cache-datomic-entity! caches/job-ent->user-cache :job/user job-ent)) 22 | 23 | (defn instance-uuid->job-uuid-datomic-query 24 | "Queries for the job uuid from an instance uuid. 25 | Returns nil if the instance uuid doesn't correspond 26 | to a job" 27 | [db instance-uuid] 28 | (->> (d/entity db [:instance/task-id (str instance-uuid)]) 29 | :job/_instance 30 | :job/uuid)) 31 | 32 | (let [miss-fn 33 | (fn [instance-uuid] 34 | (str (instance-uuid->job-uuid-datomic-query (d/db datomic/conn) instance-uuid)))] 35 | (defn instance-uuid->job-uuid-cache-lookup 36 | "Get job-uuid from cache if it is present, else search datomic for it" 37 | [instance-uuid] 38 | (ccache/lookup-cache! caches/instance-uuid->job-uuid identity miss-fn instance-uuid))) 39 | 40 | (let [miss-fn 41 | (fn [job-uuid] 42 | (d/entity (d/db datomic/conn) [:job/uuid job-uuid]))] 43 | (defn job-uuid->job-map-cache-lookup 44 | "Get job-map from cache if it is present, else search datomic for it" 45 | [job-uuid] 46 | (ccache/lookup-cache! caches/job-uuid->job-map identity miss-fn job-uuid))) 47 | -------------------------------------------------------------------------------- /scheduler/src/cook/caches.clj: -------------------------------------------------------------------------------- 1 | (ns cook.caches 2 | (:require [chime] 3 | [cook.cache :as ccache] 4 | [cook.config :as config] 5 | [mount.core :as mount]) 6 | (:import (com.google.common.cache Cache CacheBuilder) 7 | (java.util.concurrent TimeUnit))) 8 | 9 | (defn new-cache [config] 10 | "Build a new cache" 11 | (-> (CacheBuilder/newBuilder) 12 | (.maximumSize (get-in config [:settings :cache-working-set-size])) 13 | ;; if its not been accessed in 2 hours, whatever is going on, its not being visted by the 14 | ;; scheduler loop anymore. E.g., its probably failed/done and won't be needed. 
So, 15 | ;; lets kick it out to keep cache small. 16 | (.expireAfterAccess 2 TimeUnit/HOURS) 17 | (.build))) 18 | 19 | (defn passport-cache [config] 20 | "Build a new passport-related cache" 21 | (-> (CacheBuilder/newBuilder) 22 | (.maximumSize (get-in config [:settings :passport :job-cache-set-size])) 23 | (.expireAfterAccess (get-in config [:settings :passport :job-cache-expiry-time-hours]) TimeUnit/HOURS) 24 | (.build))) 25 | 26 | (defn lookup-cache-datomic-entity! 27 | "Specialized function for caching where datomic entities are the key. 28 | Extracts :db/id so that we don't keep the entity alive in the cache." 29 | [cache miss-fn entity] 30 | (ccache/lookup-cache! cache :db/id miss-fn entity)) 31 | 32 | (mount/defstate ^Cache job-ent->resources-cache :start (new-cache config/config)) 33 | (mount/defstate ^Cache job-ent->pool-cache :start (new-cache config/config)) 34 | (mount/defstate ^Cache task-ent->user-cache :start (new-cache config/config)) 35 | (mount/defstate ^Cache job-ent->user-cache :start (new-cache config/config)) 36 | (mount/defstate ^Cache task->feature-vector-cache :start (new-cache config/config)) 37 | (mount/defstate ^Cache user->group-ids-cache :start (new-cache config/config)) 38 | (mount/defstate ^Cache recent-synthetic-pod-job-uuids :start 39 | (-> (CacheBuilder/newBuilder) 40 | (.maximumSize (:synthetic-pod-recency-size (config/kubernetes))) 41 | ; We blocklist a given job from being autoscaled soon after a prior autoscaling. 
42 | (.expireAfterWrite (:synthetic-pod-recency-seconds (config/kubernetes)) TimeUnit/SECONDS) 43 | (.build))) 44 | (mount/defstate ^Cache pool-name->exists?-cache :start (new-cache config/config)) 45 | (mount/defstate ^Cache pool-name->accepts-submissions?-cache :start (new-cache config/config)) 46 | (mount/defstate ^Cache pool-name->db-id-cache :start (new-cache config/config)) 47 | (mount/defstate ^Cache user-and-pool-name->quota :start (new-cache config/config)) 48 | (mount/defstate ^Cache instance-uuid->job-uuid :start (passport-cache config/config)) 49 | (mount/defstate ^Cache job-uuid->job-map :start (passport-cache config/config)) -------------------------------------------------------------------------------- /scheduler/src/cook/compute_cluster/metrics.clj: -------------------------------------------------------------------------------- 1 | (ns cook.compute-cluster.metrics 2 | (:require [metrics.timers :as timers])) 3 | 4 | (defn calculate-name 5 | "Given a metric name and compute cluster name, come up with the metric path to use." 6 | [metric-name compute-cluster-name] 7 | ["cook" 8 | metric-name 9 | (str "compute-cluster-" compute-cluster-name)]) 10 | 11 | (defn timer 12 | "Given a metric name and a compute cluster name, returns a timer metric." 
13 | [metric-name compute-cluster-name] 14 | (timers/timer (calculate-name metric-name compute-cluster-name))) 15 | -------------------------------------------------------------------------------- /scheduler/src/cook/curator.clj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/scheduler/src/cook/curator.clj -------------------------------------------------------------------------------- /scheduler/src/cook/kubernetes/metrics.clj: -------------------------------------------------------------------------------- 1 | (ns cook.kubernetes.metrics 2 | (:require [metrics.core :as core] 3 | [metrics.counters :as counters] 4 | [metrics.meters :as meters] 5 | [metrics.timers :as timers]) 6 | (:import (com.codahale.metrics Histogram MetricRegistry MetricRegistry$MetricSupplier SlidingTimeWindowArrayReservoir) 7 | (java.util.concurrent TimeUnit))) 8 | 9 | (defn calculate-name 10 | "Given a metric name and compute cluster name, come up with the metric path to use." 11 | [metric-name compute-cluster-name] 12 | ["cook-k8s" 13 | metric-name 14 | (str "compute-cluster-" compute-cluster-name)]) 15 | 16 | (defn counter 17 | "Given a metric name and a compute cluster name, returns a counter metric." 18 | [metric-name compute-cluster-name] 19 | (counters/counter (calculate-name metric-name compute-cluster-name))) 20 | 21 | (defn meter 22 | "Given a metric name and a compute cluster name, returns a meter metric." 23 | [metric-name compute-cluster-name] 24 | (meters/meter (calculate-name metric-name compute-cluster-name))) 25 | 26 | (defn timer 27 | "Given a metric name and a compute cluster name, returns a timer metric." 28 | [metric-name compute-cluster-name] 29 | (timers/timer (calculate-name metric-name compute-cluster-name))) 30 | 31 | (def histogram-supplier 32 | (reify 33 | MetricRegistry$MetricSupplier 34 | (newMetric [_] 35 | (Histogram. 
36 | ; The default implementation of `Reservoir` in dropwizard metrics is 37 | ; `ExponentiallyDecayingReservoir`, which stores data samples for some 38 | ; time. When new samples stop arriving, it uses the historical data and 39 | ; returns the same characteristics for the data distribution again and 40 | ; again, simply because the data distribution doesn’t change. Here we 41 | ; switch from the default `ExponentiallyDecayingReservoir` to a sliding 42 | ; time window reservoir, which gives zeros when there is no data. See 43 | ; https://engineering.salesforce.com/be-careful-with-reservoirs-708884018daf 44 | ; for more information. 45 | (SlidingTimeWindowArrayReservoir. 300 TimeUnit/SECONDS))))) 46 | 47 | (defn histogram 48 | "Given a metric name and a compute cluster name, returns a histogram metric." 49 | [metric-name compute-cluster-name] 50 | (.histogram 51 | ^MetricRegistry core/default-registry 52 | (core/metric-name 53 | (calculate-name metric-name compute-cluster-name)) 54 | histogram-supplier)) 55 | -------------------------------------------------------------------------------- /scheduler/src/cook/mesos/reason.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 
15 | ;; 16 | (ns cook.mesos.reason 17 | (:require [clojure.tools.logging :as log] 18 | [datomic.api :as d])) 19 | 20 | (defn reason-code->reason-entity 21 | [db reason-code] 22 | (d/entity db [:reason/code reason-code])) 23 | 24 | (defn reason-code->reason-string 25 | [db reason-code] 26 | (:reason/string (reason-code->reason-entity db reason-code))) 27 | 28 | (defn mesos-reason->cook-reason-entity-id 29 | [db task-id mesos-reason] 30 | (if-let [reason-entity-id (:db/id (d/entity db [:reason/mesos-reason mesos-reason]))] 31 | reason-entity-id 32 | (do 33 | (log/warn "Unknown mesos reason:" mesos-reason "for task" task-id) 34 | (:db/id (d/entity db [:reason/name :mesos-unknown]))))) 35 | 36 | (defn instance-entity->reason-entity 37 | [db instance] 38 | (or (:instance/reason instance) 39 | (reason-code->reason-entity db (:instance/reason-code instance)))) 40 | 41 | (defn all-known-reasons 42 | "Returns a list of Datomic entities corresponding to all 43 | of the currently defined failure reasons." 44 | [db] 45 | (map (partial d/entity db) 46 | (d/q '[:find [?e ...] 47 | :in $ 48 | :where 49 | [?e :reason/code]] 50 | db))) 51 | 52 | (defn default-failure-limit 53 | [db] 54 | (:scheduler.config/mea-culpa-failure-limit (d/entity db :scheduler/config))) 55 | -------------------------------------------------------------------------------- /scheduler/src/cook/passport.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 
6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.passport 17 | (:require [clojure.data.json :as json] 18 | [clojure.tools.logging :as log] 19 | [cook.config :as config])) 20 | 21 | (defn log-event 22 | "Log event to cook-passport log file" 23 | [{:keys [event-type] :as log-data}] 24 | (when (:enabled? (config/passport)) 25 | (log/log config/passport-logger-ns :info nil (json/write-str 26 | (assoc 27 | log-data 28 | :source :cook-scheduler 29 | :event-type (str "cook-scheduler/" (name event-type))))))) 30 | 31 | (def checkpoint-volume-mounts-key-selected :checkpoint-volume-mounts-key-selected) 32 | (def default-image-selected :default-image-selected) 33 | (def init-container-image-selected :init-container-image-selected) 34 | (def job-created :job-created) 35 | (def job-submitted :job-submitted) 36 | (def pod-completed :pod-completed) 37 | (def pod-submission-succeeded :pod-submission-succeeded) 38 | (def sidecar-image-selected :sidecar-image-selected) 39 | (def synthetic-pod-submission-succeeded :synthetic-pod-submission-succeeded) 40 | (def pod-submission-failed :pod-submission-failed) 41 | (def synthetic-pod-submission-failed :synthetic-pod-submission-failed) 42 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/adjustment.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in 
compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.plugins.adjustment 17 | (:require [clojure.tools.logging :as log] 18 | [cook.config :as config] 19 | [cook.plugins.definitions :refer [JobAdjuster]] 20 | [cook.plugins.util] 21 | [mount.core :as mount])) 22 | 23 | (def no-op 24 | (reify JobAdjuster 25 | (adjust-job [_ job-map _] job-map))) 26 | 27 | (defn create-default-plugin-object 28 | "Returns the configured JobAdjuster, or a no-op if none is defined." 29 | [config] 30 | (let [factory-fn (get-in config [:settings :plugins :job-adjuster :factory-fn])] 31 | (if factory-fn 32 | (do 33 | (log/info "Creating job adjuster plugin with" factory-fn) 34 | (if-let [resolved-fn (cook.plugins.util/resolve-symbol (symbol factory-fn))] 35 | (resolved-fn config) 36 | (throw (ex-info (str "Unable to resolve factory fn " factory-fn) {})))) 37 | no-op))) 38 | 39 | (mount/defstate plugin 40 | :start (create-default-plugin-object config/config)) 41 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/completion.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 
;; You may obtain a copy of the License at
;;
;;   http://www.apache.org/licenses/LICENSE-2.0
;;
;; Unless required by applicable law or agreed to in writing, software
;; distributed under the License is distributed on an "AS IS" BASIS,
;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;; See the License for the specific language governing permissions and
;; limitations under the License.
;;
(ns cook.plugins.completion
  (:require [clojure.tools.logging :as log]
            [cook.config :as config]
            [cook.plugins.definitions :refer [InstanceCompletionHandler]]
            [cook.plugins.util]
            [mount.core :as mount]))

;; Handler that ignores instance-completion events entirely.
(def no-op
  (reify InstanceCompletionHandler
    (on-instance-completion [_ _ _])))

(defn create-default-plugin-object
  "Returns the configured InstanceCompletionHandler, or a no-op if none is defined.

  Looks up the :factory-fn symbol configured under
  [:settings :plugins :instance-completion], resolves it, and invokes it with
  the config map. Throws if a factory-fn is configured but cannot be resolved."
  [config]
  (let [factory-fn (get-in config [:settings :plugins :instance-completion :factory-fn])]
    (if factory-fn
      (do
        (log/info "Creating instance completion plugin with" factory-fn)
        (if-let [resolved-fn (cook.plugins.util/resolve-symbol (symbol factory-fn))]
          (resolved-fn config)
          ;; BUG FIX: ex-info requires a data map argument; the previous 1-arity
          ;; call would itself throw an ArityException on this error path,
          ;; masking the intended message. Now matches the sibling plugin
          ;; namespaces (adjustment, file, job-submission-modifier, pool).
          (throw (ex-info (str "Unable to resolve factory fn " factory-fn) {}))))
      no-op)))

(mount/defstate plugin
  :start (create-default-plugin-object config/config))
-------------------------------------------------------------------------------- /scheduler/src/cook/plugins/demo_plugin.clj: --------------------------------------------------------------------------------
;;
;; Copyright (c) Two Sigma Open Source, LLC
;;
;; Licensed under the Apache License, Version 2.0 (the "License");
;; you may not use this file except in compliance with the License.
6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.plugins.demo-plugin 17 | (:require [clj-time.core :as t] 18 | [clojure.string :as str] 19 | [cook.plugins.definitions :as chd])) 20 | 21 | (def uuid-seen-counts (atom {})) 22 | 23 | (defn- generate-result 24 | [result message] 25 | {:status result :message message :cache-expires-at (-> 1 t/seconds t/from-now)}) 26 | 27 | ; Demo validation plugin, designed to match with the integration tests. 28 | (defrecord DemoValidateSubmission [] 29 | chd/JobSubmissionValidator 30 | (chd/check-job-submission 31 | [this {:keys [name] :as job-map} _] 32 | (if (and name (str/starts-with? name "plugin_test.submit_fail")) 33 | (generate-result :rejected "Message1- Fail to submit") 34 | (generate-result :accepted "Message2")))) 35 | 36 | (defrecord DemoValidateSubmission2 [] 37 | chd/JobSubmissionValidator 38 | (chd/check-job-submission [this {:keys [name]} _] 39 | (if (and name (str/starts-with? name "plugin_test.submit_fail2")) 40 | (generate-result :rejected "Message5- Plugin2 failed") 41 | (generate-result :accepted "Message6")))) 42 | 43 | (defrecord DemoFilterLaunch [] 44 | chd/JobLaunchFilter 45 | (chd/check-job-launch 46 | [this {:keys [:job/name :job/uuid] :as job-map}] 47 | (let [newdict (swap! uuid-seen-counts update-in [uuid] (fnil inc 0)) 48 | seen (get newdict uuid)] 49 | (if (and name 50 | (str/starts-with? 
name "plugin_test.launch_defer") 51 | (<= seen 3)) 52 | (generate-result :deferred "Message3") 53 | (generate-result :accepted "Message4"))))) 54 | 55 | (defn launch-factory 56 | "Factory method for the launch-plugin to be used in config.edn" 57 | [] 58 | (->DemoFilterLaunch)) 59 | 60 | (defn submission-factory 61 | "Factory method for the submission plugin to be used in config.edn" 62 | [] 63 | (->DemoValidateSubmission)) 64 | 65 | (defn submission-factory2 66 | "Factory method for the second submission plugin to be used in config.edn" 67 | [] 68 | (->DemoValidateSubmission2)) 69 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/file.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.plugins.file 17 | (:require [clojure.tools.logging :as log] 18 | [cook.config :as config] 19 | [cook.plugins.definitions :refer [FileUrlGenerator]] 20 | [cook.plugins.util] 21 | [mount.core :as mount])) 22 | 23 | (defrecord NilFileUrlPlugin [] 24 | FileUrlGenerator 25 | (file-url [this instance] 26 | nil)) 27 | 28 | (defn create-plugin-object 29 | "Returns the configured FileUrlPlugin, or a NilFileUrlPlugin if none is defined." 
30 | [config] 31 | (let [file-url (get-in config [:settings :plugins :file-url]) 32 | factory-fn (:factory-fn file-url)] 33 | (if factory-fn 34 | (do 35 | (log/info "Creating file url plugin with" factory-fn) 36 | (if-let [resolved-fn (cook.plugins.util/resolve-symbol (symbol factory-fn))] 37 | (resolved-fn config) 38 | (throw (ex-info (str "Unable to resolve factory fn " factory-fn) {})))) 39 | (NilFileUrlPlugin.)))) 40 | 41 | (mount/defstate plugin 42 | :start (create-plugin-object config/config)) 43 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/job_submission_modifier.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | 17 | (ns cook.plugins.job-submission-modifier 18 | (:require [clojure.tools.logging :as log] 19 | [cook.config :as config] 20 | [cook.plugins.definitions :refer [choose-pool-for-job modify-job JobRouter JobSubmissionModifier]] 21 | [cook.plugins.util] 22 | [mount.core :as mount])) 23 | 24 | (defn pool-name->effective-pool-name 25 | "Given a pool name and job from a submission returns the effective pool name" 26 | [pool-name-from-submission job] 27 | (if-let [job-router (config/job-routing-pool-name? 
pool-name-from-submission)] 28 | (choose-pool-for-job job-router job) 29 | (or pool-name-from-submission (config/default-pool)))) 30 | 31 | (defrecord IdentityJobSubmissionModifier [] 32 | JobSubmissionModifier 33 | ; The IdentityJobSubmissionModifier doesn't make any changes to what users submit except 34 | ; to add the calculated pool 35 | (modify-job [this job pool-name] 36 | (let [effective-pool-name (pool-name->effective-pool-name pool-name job)] 37 | (assoc job :pool effective-pool-name)))) 38 | 39 | (defn create-plugin-object 40 | "Returns the configured JobSubmissionModifier, or a IdentityJobSubmissionModifier if none is defined." 41 | [config] 42 | (let [factory-fn (get-in config [:settings :plugins :job-submission-modifier :factory-fn])] 43 | (if factory-fn 44 | (do 45 | (log/info "Creating job submission modifier plugin with" factory-fn) 46 | (if-let [resolved-fn (cook.plugins.util/resolve-symbol (symbol factory-fn))] 47 | (resolved-fn config) 48 | (throw (ex-info (str "Unable to resolve factory fn " factory-fn) {})))) 49 | (IdentityJobSubmissionModifier.)))) 50 | 51 | (mount/defstate plugin 52 | :start (create-plugin-object config/config)) 53 | 54 | (defn apply-job-submission-modifier-plugins 55 | "Modify a user-submitted job before passing it further down the submission pipeline." 
56 | [raw-job pool-name] 57 | (modify-job plugin raw-job pool-name)) -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/pool.clj: -------------------------------------------------------------------------------- 1 | (ns cook.plugins.pool 2 | (:require [clojure.tools.logging :as log] 3 | [cook.config :as config] 4 | [cook.plugins.definitions :refer [PoolSelector]] 5 | [cook.plugins.util] 6 | [mount.core :as mount])) 7 | 8 | (defrecord AttributePoolSelector [attribute-name default-pool] 9 | PoolSelector 10 | (select-pool [this offer] 11 | (or (->> offer :attributes (filter #(= attribute-name (:name %))) first :text) 12 | default-pool))) 13 | 14 | (defn create-plugin-object 15 | "Returns the configured PoolSelector, or an AttributePoolSelector if none is defined." 16 | [config] 17 | (let [pool-selection (get-in config [:settings :plugins :pool-selection]) 18 | factory-fn (:factory-fn pool-selection)] 19 | (if factory-fn 20 | (do 21 | (log/info "Creating pool selection plugin with" factory-fn) 22 | (if-let [resolved-fn (cook.plugins.util/resolve-symbol (symbol factory-fn))] 23 | (resolved-fn config) 24 | (throw (ex-info (str "Unable to resolve factory fn " factory-fn) {})))) 25 | (AttributePoolSelector. 
(:attribute-name pool-selection) 26 | (:default-pool pool-selection))))) 27 | 28 | (mount/defstate plugin 29 | :start (create-plugin-object config/config)) 30 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/pool_mover.clj: -------------------------------------------------------------------------------- 1 | (ns cook.plugins.pool-mover 2 | (:require [clojure.tools.logging :as log] 3 | [cook.cached-queries :as cached-queries] 4 | [cook.config :as config] 5 | [cook.plugins.definitions :as chd] 6 | [cook.prometheus-metrics :as prom] 7 | [datomic.api :as d] 8 | [metrics.counters :as counters])) 9 | 10 | (counters/defcounter [cook-mesos plugins pool-mover jobs-migrated]) 11 | 12 | (defrecord PoolMoverJobAdjuster [pool-mover-config] 13 | chd/JobAdjuster 14 | (adjust-job [_ {:keys [job/uuid job/pool] :as job-txn} db] 15 | (let [submission-pool (-> db (d/entity pool) :pool/name (or (config/default-pool)))] 16 | (if-let [{:keys [users destination-pool]} (get pool-mover-config submission-pool)] 17 | (let [user (cached-queries/job-ent->user job-txn)] 18 | (if-let [{:keys [portion]} (get users user)] 19 | (if (and (number? portion) 20 | (> (* portion 100) (-> uuid hash (mod 100)))) 21 | (try 22 | (log/info "Moving job" uuid "(" user ") from" submission-pool "pool to" 23 | destination-pool "pool due to pool-mover configuration") 24 | (prom/inc prom/pool-mover-jobs-updated) 25 | (counters/inc! 
jobs-migrated) 26 | (assoc job-txn :job/pool (-> db (d/entity [:pool/name destination-pool]) :db/id)) 27 | (catch Throwable t 28 | (log/error t "Error when moving pool to" destination-pool) 29 | job-txn)) 30 | job-txn) 31 | job-txn)) 32 | job-txn)))) 33 | 34 | (defn make-pool-mover-job-adjuster 35 | [config] 36 | (let [pool-mover-config (get-in config [:settings :plugins :pool-mover])] 37 | (log/info "Configuring PoolMoverJobAdjuster" pool-mover-config) 38 | (->PoolMoverJobAdjuster pool-mover-config))) 39 | -------------------------------------------------------------------------------- /scheduler/src/cook/plugins/util.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.plugins.util 17 | (:require [clj-time.core :as t])) 18 | 19 | (defn resolve-symbol 20 | "Resolve the given symbol to the corresponding Var." 
21 | [sym] 22 | (resolve (some-> sym namespace symbol require) sym)) 23 | 24 | (def positive-infinity-date 25 | (t/date-time 2999 12 31)) 26 | -------------------------------------------------------------------------------- /scheduler/src/cook/rate_limit.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.rate-limit 17 | (:require [clojure.tools.logging :as log] 18 | [cook.config :refer [config]] 19 | [cook.rate-limit.generic :as rtg] 20 | [mount.core :as mount])) 21 | 22 | ; Import from cook.rate-limit.generic some relevant functions. 23 | (def spend! rtg/spend!) 24 | (def time-until-out-of-debt-millis! rtg/time-until-out-of-debt-millis!) 25 | (def get-token-count! rtg/get-token-count!) 26 | (def enforce? rtg/enforce?) 27 | (def flush! rtg/flush!) 28 | (def AllowAllRateLimiter rtg/AllowAllRateLimiter) 29 | 30 | (defn create-job-submission-rate-limiter 31 | "From the configuration map, extract the keys that setup the job-submission rate limiter and return 32 | the constructed object. If the configuration map is not found, the AllowAllRateLimiter is returned." 
33 | [config] 34 | (let [{:keys [settings]} config 35 | {:keys [rate-limit]} settings 36 | {:keys [expire-minutes job-submission]} rate-limit] 37 | (if (seq job-submission) 38 | (rtg/make-tbf-rate-limiter (assoc job-submission :expire-minutes expire-minutes)) 39 | AllowAllRateLimiter))) 40 | 41 | (mount/defstate job-submission-rate-limiter 42 | :start (create-job-submission-rate-limiter config)) 43 | 44 | (defn create-compute-cluster-launch-rate-limiter 45 | "From the configuration map, extract the keys that setup the job-launch rate limiter and return 46 | the constructed object. If the configuration map is not found, the AllowAllRateLimiter is returned." 47 | [compute-cluster-name compute-cluster-launch-rate-limits] 48 | (if (seq compute-cluster-launch-rate-limits) 49 | (do 50 | (log/info "For compute cluster" compute-cluster-name "configuring global rate limit config" compute-cluster-launch-rate-limits) 51 | (rtg/make-tbf-rate-limiter compute-cluster-launch-rate-limits)) 52 | (do 53 | (log/info "For compute cluster" compute-cluster-name "not configuring global rate limit because no configuration set") 54 | AllowAllRateLimiter))) 55 | 56 | (def compute-cluster-launch-rate-limiter-key "*DEF*") 57 | 58 | -------------------------------------------------------------------------------- /scheduler/src/cook/regexp_tools.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
;; See the License for the specific language governing permissions and
;; limitations under the License.
;;
(ns cook.regexp-tools
  (:require [clojure.tools.logging :as log]))

(defn match-based-on-regexp
  "Given a list of maps [{<regexp-name> <regexp-string> <field-name> <value>} ...]
  (match-list), the key under which each map stores its regexp (regexp-name),
  the key whose value should be returned (field-name), and a string to match
  against (key), return the field-name value of the first map whose regexp is
  found in key (via re-find). Returns nil when no map matches. Any failure
  (e.g. an invalid regexp) is rethrown wrapped in ex-info with the inputs."
  [regexp-name field-name match-list key]
  (try
    (-> match-list
        (->> (filter (fn [map]
                       (let [regexp (get map regexp-name)
                             pattern (re-pattern regexp)]
                         (re-find pattern key)))))
        first
        (get field-name))
    (catch Exception e
      (throw (ex-info "Failed matching key" {:regexp-name regexp-name :field-name field-name :match-list match-list :key key} e)))))

(defn match-based-on-pool-name
  "Given a list of maps [{:pool-regex <regexp> <field> <value>} ...]
  (match-list), an effective pool name, and a field key, return the field
  value of the first map whose :pool-regex matches the pool name. Falls back
  to default-value (nil unless supplied) when no entry matches or the
  matching entry's field value is nil."
  [match-list effective-pool-name field & {:keys [default-value] :or {default-value nil}}]
  (let [value (match-based-on-regexp
                :pool-regex
                field
                match-list
                effective-pool-name)]
    (if (some? value)
      value
      default-value)))
-------------------------------------------------------------------------------- /scheduler/src/cook/reporter.clj: --------------------------------------------------------------------------------
;;
;; Copyright (c) Two Sigma Open Source, LLC
;;
;; Licensed under the Apache License, Version 2.0 (the "License");
;; you may not use this file except in compliance with the License.
6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.reporter 17 | (:require [clojure.tools.logging :as log] 18 | [datomic.api :refer [q]] 19 | [metatransaction.core :refer [db]] 20 | [metrics.core :as metrics]) 21 | (:import (com.codahale.metrics ConsoleReporter MetricFilter) 22 | (com.codahale.metrics.graphite Graphite GraphiteReporter PickledGraphite) 23 | (java.net InetSocketAddress) 24 | (java.util.concurrent TimeUnit))) 25 | 26 | ;; the default registry 27 | (def registry metrics/default-registry) 28 | 29 | (defn jmx-reporter 30 | [] 31 | (.. (com.codahale.metrics.jmx.JmxReporter/forRegistry metrics/default-registry) 32 | (build) 33 | (start))) 34 | 35 | (defn graphite-reporter 36 | [{:keys [prefix host port pickled?]}] 37 | (log/info "Starting graphite reporter") 38 | (let [addr (InetSocketAddress. host port) 39 | graphite (if pickled? 40 | (PickledGraphite. addr) 41 | (Graphite. addr))] 42 | (doto (.. (GraphiteReporter/forRegistry metrics/default-registry) 43 | (prefixedWith prefix) 44 | (filter MetricFilter/ALL) 45 | (convertRatesTo TimeUnit/SECONDS) 46 | (convertDurationsTo TimeUnit/MILLISECONDS) 47 | (build graphite)) 48 | (.start 30 TimeUnit/SECONDS)))) 49 | 50 | (defn console-reporter 51 | "Creates and starts a ConsoleReporter for metrics" 52 | [] 53 | (doto (.. 
(ConsoleReporter/forRegistry metrics/default-registry) 54 | (convertRatesTo TimeUnit/SECONDS) 55 | (convertDurationsTo TimeUnit/MILLISECONDS) 56 | (build)) 57 | (.start 30 TimeUnit/SECONDS))) 58 | -------------------------------------------------------------------------------- /scheduler/src/cook/rest/cors.clj: -------------------------------------------------------------------------------- 1 | (ns cook.rest.cors) 2 | 3 | (defn preflight? 4 | [{:keys [request-method]}] 5 | (= :options request-method)) 6 | 7 | (defn same-origin? 8 | "Returns true if the request is from the same origin as the provided origin header" 9 | [{:keys [headers scheme]}] 10 | (let [{:strs [host origin x-forwarded-proto]} headers 11 | forwarded-or-scheme (or x-forwarded-proto 12 | (when scheme (name scheme)))] 13 | (when (and host origin forwarded-or-scheme) 14 | (= origin (str forwarded-or-scheme "://" host))))) 15 | 16 | (defn request-allowed? 17 | "Returns true if the request is either from the same origin or matches a pattern in cors-origins. 18 | The request should have a non-nil origin header." 19 | [req cors-origins] 20 | (or (same-origin? req) 21 | (let [origin (get-in req [:headers "origin"])] 22 | (some #(re-matches % origin) cors-origins)))) 23 | 24 | (defn wrap-preflight 25 | "Middleware for supporting CORS preflight requests" 26 | [handler cors-origins] 27 | (fn preflight-handler [{:keys [headers] :as req}] 28 | (let [{:strs [origin]} headers] 29 | (if (and (preflight? req) origin) 30 | (if (request-allowed? 
req cors-origins) 31 | (let [{:strs [access-control-request-headers]} headers] 32 | {:status 200 33 | :headers {"Access-Control-Allow-Credentials" "true" 34 | "Access-Control-Allow-Headers" access-control-request-headers 35 | "Access-Control-Allow-Methods" "PUT, GET, OPTIONS, DELETE" 36 | "Access-Control-Allow-Origin" origin 37 | "Access-Control-Max-Age" "86400"}}) ; 1 day 38 | {:status 403 39 | :body (str "Origin " origin " not allowed")}) 40 | (handler req))))) 41 | 42 | (defn wrap-cors 43 | "Middleware for supporting CORS requests" 44 | [handler cors-origins] 45 | (fn cors-handler [{:keys [headers] :as req}] 46 | (let [{:strs [origin]} headers] 47 | (if origin 48 | (if (request-allowed? req cors-origins) 49 | (let [resp (handler req)] 50 | (update-in resp [:headers] assoc 51 | "Access-Control-Allow-Credentials" "true" 52 | "Access-Control-Allow-Origin" origin)) 53 | {:status 403 54 | :body (str "Cross origin request denied from " origin)}) 55 | (handler req))))) ; If no origin is provided, pass the request through. 56 | 57 | (defn cors-middleware 58 | "Wraps the provided handler with wrap-cors and wrap-preflight in the correct order" 59 | [handler cors-origins] 60 | (-> handler 61 | (wrap-cors cors-origins) 62 | (wrap-preflight cors-origins))) 63 | -------------------------------------------------------------------------------- /scheduler/src/cook/rest/secret.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 
6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.rest.secret 17 | "This namespace is for development. It uses a fake secret." 18 | (:require [ring.middleware.params])) 19 | 20 | (defn wrap-terribly-insecure-auth 21 | [handler] 22 | (fn [{{user "user"} :params :as req}] 23 | (handler (assoc req :authorization/user user)))) 24 | 25 | (defn authorization-middleware 26 | [auth] 27 | (-> auth 28 | (wrap-terribly-insecure-auth))) 29 | -------------------------------------------------------------------------------- /scheduler/src/fork/metrics_clojure/LICENSE.markdown: -------------------------------------------------------------------------------- 1 | MIT/X11 License 2 | =============== 3 | 4 | Copyright (c) 2011-2017 Steve Losh and contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /scheduler/src/fork/metrics_clojure/README.txt: -------------------------------------------------------------------------------- 1 | This code is copied from https://github.com/metrics-clojure/metrics-clojure 2 | and git hash a1dbacc748a1f8165f0094e2229c84f228efe29b 3 | 4 | We need the unreleased 3.0.0x branch for support for >JDK8. We modified the file's package name. 5 | -------------------------------------------------------------------------------- /scheduler/src/fork/metrics_clojure/metrics/jvm/core.clj: -------------------------------------------------------------------------------- 1 | (ns fork.metrics-clojure.metrics.jvm.core 2 | (:import (com.codahale.metrics MetricRegistry) 3 | (com.codahale.metrics.jvm ThreadStatesGaugeSet GarbageCollectorMetricSet FileDescriptorRatioGauge 4 | MemoryUsageGaugeSet JvmAttributeGaugeSet)) 5 | (:require [metrics.core :refer [add-metric default-registry]])) 6 | 7 | (defn register-jvm-attribute-gauge-set 8 | ([^MetricRegistry reg] 9 | (register-jvm-attribute-gauge-set reg ["jvm" "attribute"])) 10 | ([^MetricRegistry reg title] 11 | (add-metric reg title (new JvmAttributeGaugeSet)))) 12 | 13 | (defn register-memory-usage-gauge-set 14 | ([^MetricRegistry reg] 15 | (register-memory-usage-gauge-set reg ["jvm" "memory"])) 16 | ([^MetricRegistry reg title] 17 | (add-metric reg title (new MemoryUsageGaugeSet)))) 18 | 19 | (defn 
register-file-descriptor-ratio-gauge-set 20 | ([^MetricRegistry reg] 21 | (register-file-descriptor-ratio-gauge-set reg ["jvm" "file"])) 22 | ([^MetricRegistry reg title] 23 | (add-metric reg title (new FileDescriptorRatioGauge)))) 24 | 25 | (defn register-garbage-collector-metric-set 26 | ([^MetricRegistry reg] 27 | (register-garbage-collector-metric-set reg ["jvm" "gc"])) 28 | ([^MetricRegistry reg title] 29 | (add-metric reg title (new GarbageCollectorMetricSet)))) 30 | 31 | (defn register-thread-state-gauge-set 32 | ([^MetricRegistry reg] 33 | (register-thread-state-gauge-set reg ["jvm" "thread"])) 34 | ([^MetricRegistry reg title] 35 | (add-metric reg title (new ThreadStatesGaugeSet)))) 36 | 37 | (defn instrument-jvm 38 | ([] 39 | (instrument-jvm default-registry)) 40 | ([^MetricRegistry reg] 41 | (doseq [register-metric-set [register-jvm-attribute-gauge-set 42 | register-memory-usage-gauge-set 43 | register-file-descriptor-ratio-gauge-set 44 | register-garbage-collector-metric-set 45 | register-thread-state-gauge-set]] 46 | (register-metric-set reg)))) 47 | -------------------------------------------------------------------------------- /scheduler/test-resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.RollingFileAppender 3 | log4j.appender.A1.File=test-log/app.log 4 | log4j.appender.A1.MaxFileSize=500MB 5 | log4j.appender.A1.MaxBackupIndex=2 6 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p%c - %m%n -------------------------------------------------------------------------------- /scheduler/test/cook/test/components.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file 
(deftest test-health-check-middleware
  (let [handler (fn [_] "Called handler!")
        debug-request {:uri "/debug"
                       :request-method :get}]

    (testing "always returns 200 if leader-reports-unhealthy is false"
      (let [leader? (atom false)
            wrapped (components/health-check-middleware handler leader? false)]
        ;; healthy both before and after acquiring leadership
        (is (= 200 (:status (wrapped debug-request))))
        (reset! leader? true)
        (is (= 200 (:status (wrapped debug-request))))))

    (testing "returns 503 when leader"
      (let [leader? (atom false)
            wrapped (components/health-check-middleware handler leader? true)]
        ;; healthy until leadership is acquired, then reports unhealthy
        (is (= 200 (:status (wrapped debug-request))))
        (reset! leader? true)
        (is (= 503 (:status (wrapped debug-request))))))

    (testing "passes other requests to handler"
      (let [leader? (atom false)
            wrapped (components/health-check-middleware handler leader? true)]
        ;; non-debug URIs are delegated regardless of leadership state
        (is (= "Called handler!" (wrapped {:uri "/real-request"})))
        (reset! leader? true)
        (is (= "Called handler!" (wrapped {:uri "/real-request"})))))))
37 | (log-structured/info "some message" {:uuid (UUID/randomUUID)})) 38 | -------------------------------------------------------------------------------- /scheduler/test/cook/test/mesos/reason.clj: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) Two Sigma Open Source, LLC 3 | ;; 4 | ;; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ;; you may not use this file except in compliance with the License. 6 | ;; You may obtain a copy of the License at 7 | ;; 8 | ;; http://www.apache.org/licenses/LICENSE-2.0 9 | ;; 10 | ;; Unless required by applicable law or agreed to in writing, software 11 | ;; distributed under the License is distributed on an "AS IS" BASIS, 12 | ;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ;; See the License for the specific language governing permissions and 14 | ;; limitations under the License. 15 | ;; 16 | (ns cook.test.mesos.reason 17 | (:use clojure.test) 18 | (:require [cook.mesos.reason :as r] 19 | [cook.test.postgres] 20 | [cook.test.testutil :refer (restore-fresh-database!)] 21 | [datomic.api :as d :refer (q db)])) 22 | 23 | (use-fixtures :once cook.test.postgres/with-pg-db) 24 | 25 | (deftest reasons-api 26 | (let [conn (restore-fresh-database! "datomic:mem://mesos-api-test") 27 | default-limit 10 28 | ;; set default failure limit. 29 | _ @(d/transact conn [{:db/id :scheduler/config 30 | :scheduler.config/mea-culpa-failure-limit 31 | default-limit}]) 32 | db (d/db conn)] 33 | 34 | (testing "all-known-failure-reasons" 35 | (let [reasons (r/all-known-reasons db)] 36 | (doseq [reason reasons] 37 | ;; testing in any more detail is overkill; it would amount 38 | ;; to proving that the schema is the schema 39 | (is (instance? 
datomic.Entity reason))))) 40 | 41 | (testing "default-failure-limit" 42 | (is (= default-limit (r/default-failure-limit db)))))) 43 | -------------------------------------------------------------------------------- /scheduler/test/cook/test/plugins/job_submission_modifier.clj: -------------------------------------------------------------------------------- 1 | (ns cook.test.plugins.job-submission-modifier 2 | (:require [clojure.test :refer :all] 3 | [cook.plugins.definitions :as plugins] 4 | [cook.plugins.job-submission-modifier :as job-mod])) 5 | 6 | (deftest test-identity-add-pool 7 | (let [mod-plugin (job-mod/->IdentityJobSubmissionModifier) 8 | job {} 9 | pool-name "my-pool"] 10 | (is (= "my-pool" (get (plugins/modify-job mod-plugin job pool-name) :pool) )))) 11 | 12 | (defrecord TestJobModifier [] 13 | plugins/JobSubmissionModifier 14 | (modify-job [this job pool-name] 15 | (throw (IllegalArgumentException. "TestJobModifier always throws")))) 16 | 17 | (deftest test-raise-exception 18 | ; On its own, this test has little value. We are more interested in the overall 19 | ; behavior when a real job is submitted and the plugin raises an exception. 20 | (let [mod-plugin (TestJobModifier.)] 21 | (is (thrown? 
(deftest test-guard-invalid-default-pool
  (testing "valid configurations do not throw"
    (with-redefs [pool/all-pools (constantly [{:pool/name "foo"}])
                  config/default-pool (constantly "foo")]
      (is (nil? (pool/guard-invalid-default-pool nil))))
    (with-redefs [pool/all-pools (constantly [])
                  config/default-pool (constantly nil)]
      (is (nil? (pool/guard-invalid-default-pool nil)))))

  (testing "pools present but no default pool configured"
    (with-redefs [pool/all-pools (constantly [{}])
                  config/default-pool (constantly nil)]
      (is (thrown-with-msg?
            ExceptionInfo
            #"There are pools in the database, but no default pool is configured"
            (pool/guard-invalid-default-pool nil)))))

  (testing "configured default pool missing from the database"
    (with-redefs [pool/all-pools (constantly [])
                  config/default-pool (constantly "foo")]
      (is (thrown-with-msg?
            ExceptionInfo
            #"There is no pool in the database matching the configured default pool"
            (pool/guard-invalid-default-pool nil))))
    (with-redefs [pool/all-pools (constantly [{:pool/name "bar"}])
                  config/default-pool (constantly "foo")]
      (is (thrown-with-msg?
            ExceptionInfo
            #"There is no pool in the database matching the configured default pool"
            (pool/guard-invalid-default-pool nil))))))
;; Exercises one optimizer cycle end-to-end to make sure data flows and
;; validates properly through the HostFeed/Optimizer protocols.
(deftest test-optimizer-cycle
  (let [host-info {:count 1
                   :instance-type "small"
                   :cpus 1
                   :mem 1000}
        feed (reify optimizer/HostFeed
               (get-available-host-info [_]
                 [host-info]))
        opt (reify optimizer/Optimizer
              (produce-schedule [_ queue _running _available [info & _more]]
                ;; suggest matching every queued job onto the first host
                {0 {:suggested-matches {info (map :job/uuid queue)}}}))
        job-queue [{:job/uuid (java.util.UUID/randomUUID)}
                   {:job/uuid (java.util.UUID/randomUUID)}]
        schedule (optimizer/optimizer-cycle! (constantly job-queue)
                                             (constantly [])
                                             (constantly [])
                                             feed
                                             opt)]
    (is (= 1 (count schedule)))
    (is (= host-info
           (-> schedule (get-in [0 :suggested-matches]) keys first)))))
(deftest test-deep-merge-with
  (testing "recursively merges nested maps, combining leaf values with f"
    (is (= {:a {:b {:z 3, :c 3, :d {:z 9, :x 1, :y 2}}, :e 103}, :f 4}
           (deep-merge-with +
                            {:a {:b {:c 1 :d {:x 1 :y 2}} :e 3} :f 4}
                            {:a {:b {:c 2 :d {:z 9} :z 3} :e 100}}))))
  (testing "works with non-keyword keys"
    (is (= {"foo" 2}
           (deep-merge-with - {"foo" 3} {"foo" 1}))))
  (testing "nil leaf values propagate to the merge fn and raise"
    (is (thrown? NullPointerException
                 (deep-merge-with - {"foo" nil} {"foo" 1})))
    (is (thrown? NullPointerException
                 (deep-merge-with - {"foo" 1} {"foo" nil})))))
65 | (let [uuid (UUID/randomUUID) 66 | map {:integer 2 :float 1.2 :string "foo" :uuid uuid :nested-map {:nested-string "bar" :nested-int 3}} 67 | formatted-map (format-map-for-structured-logging map)] 68 | (is (= {:integer 2 :float 1.2 :string "foo" :uuid (str uuid) :nested-map {:nested-string "bar" :nested-int 3}} formatted-map)))) 69 | -------------------------------------------------------------------------------- /scheduler/travis/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Install the current version of the jobclient 6 | pushd ${GITHUB_WORKSPACE}/jobclient/java 7 | mvn install 8 | popd 9 | 10 | # Install lein dependencies 11 | lein with-profiles +test deps 12 | 13 | -------------------------------------------------------------------------------- /sidecar/.dockerignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | *.egg-info 4 | *.egg/ 5 | *.pyc 6 | *.swp 7 | __pycache__ 8 | -------------------------------------------------------------------------------- /sidecar/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | dist/ 4 | build/ 5 | -------------------------------------------------------------------------------- /sidecar/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-alpine 2 | 3 | COPY cook /app/cook 4 | COPY Dockerfile /app 5 | COPY README.md /app 6 | COPY setup.py /app 7 | WORKDIR /app 8 | RUN pip3 install -e . 9 | CMD ["cook-sidecar", "--file-server-port", "8080"] 10 | -------------------------------------------------------------------------------- /sidecar/README.md: -------------------------------------------------------------------------------- 1 | A python file server that replicates part of the Mesos `files` endpoint API for backwards compatibility. 
2 | 3 | See http://mesos.apache.org/documentation/latest/endpoints/files/download/ 4 | http://mesos.apache.org/documentation/latest/endpoints/files/read/ and 5 | http://mesos.apache.org/documentation/latest/endpoints/files/browse/ 6 | 7 | ## Building 8 | 9 | pip install dependencies: 10 | 11 | ```bash 12 | $ pip3 install -e . 13 | ``` 14 | 15 | ## Running 16 | 17 | Usage: 18 | 19 | The `COOK_WORKDIR` environment variable must be set. Only files with `COOK_WORKDIR` as the root will be served. 20 | 21 | ``` 22 | cook-sidecar --file-server-port PORT 23 | ``` 24 | 25 | Run `cook-sidecar --help` for full usage documentation. 26 | 27 | Examples: 28 | 29 | ```bash 30 | $ cook-sidecar --file-server-port 8000 31 | ``` 32 | -------------------------------------------------------------------------------- /sidecar/cook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/sidecar/cook/__init__.py -------------------------------------------------------------------------------- /sidecar/cook/sidecar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twosigma/Cook/e43416aea1ff47b667101d275464ba45541f982e/sidecar/cook/sidecar/__init__.py -------------------------------------------------------------------------------- /sidecar/cook/sidecar/exit_sentinel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright (c) 2020 Two Sigma Open Source, LLC 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to 7 | # deal in the Software without restriction, including without limitation the 8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | # sell copies of the Software, 
def watch_for_file(sentinel_file_path, started_event):
    """Start a daemon thread that terminates this process when a sentinel file appears.

    The thread first waits on ``started_event`` (set once the other sidecar
    components have finished starting), then polls for ``sentinel_file_path``.
    When the file shows up, the thread sends SIGTERM to this process so the
    normal termination handler runs.

    Args:
        sentinel_file_path: path whose existence triggers shutdown.
        started_event: threading.Event gating the start of the watch.

    Returns:
        The started daemon ``threading.Thread`` (useful for testing; callers
        that ignore the return value are unaffected).
    """
    def daemon_routine():
        # Wait for other components to finish starting before arming the watcher.
        logging.info('Waiting for all components to start...')
        started_event.wait()
        # Poll for the sentinel file; 100ms keeps latency low without busy-spinning.
        logging.info('Watching for sentinel file: %s', sentinel_file_path)
        while not os.path.exists(sentinel_file_path):
            time.sleep(0.1)
        # Trigger this process's own SIGTERM handler to begin graceful shutdown.
        logging.info('Sidecar termination triggered by sentinel file: %s', sentinel_file_path)
        os.kill(os.getpid(), signal.SIGTERM)

    thread = threading.Thread(target=daemon_routine, daemon=True)
    thread.start()
    return thread
documentation files (the "Software"), to 6 | # deal in the Software without restriction, including without limitation the 7 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | # sell copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 | # IN THE SOFTWARE. 
def init_logging():
    """Configure root logging to stderr with a timestamped format.

    The level is read from the EXECUTOR_LOG_LEVEL environment variable,
    defaulting to INFO. NOTE(review): the variable name appears inherited
    from the executor component; confirm a sidecar-specific name is not
    intended for this package.
    """
    log_level = os.environ.get('EXECUTOR_LOG_LEVEL', 'INFO')
    # basicConfig accepts a level name string and is a no-op if the root
    # logger already has handlers installed.
    logging.basicConfig(level=log_level,
                        stream=sys.stderr,
                        format='%(asctime)s %(levelname)s %(message)s')
:ceiling 6} 16 | :seconds-between-jobs {:mean 3 17 | :std-dev 1 18 | :floor 1 19 | :ceiling 120} 20 | :job-duration {:mean 10 21 | :std-dev 2 22 | :floor 1 23 | :ceiling 120} 24 | :job-memory {:mean 512 25 | :std-dev 200 26 | :floor 1 27 | :ceiling 2048} 28 | :job-cpu {:mean 2.0 29 | :std-dev 1.0 30 | :floor 1.0 31 | :ceiling 4.0}}]}} 32 | -------------------------------------------------------------------------------- /simulator/project.clj: -------------------------------------------------------------------------------- 1 | (defproject cook/sim "0.1.0-SNAPSHOT" 2 | :description "Simulation tests for Cook" 3 | :dependencies [[org.clojure/clojure "1.8.0"] 4 | [clj-time "0.9.0"] 5 | [cheshire "5.5.0"] 6 | [com.datomic/datomic-free "0.9.5344" 7 | :exclusions [org.clojure/clojure joda-time]] 8 | [com.datomic/simulant "0.1.8"] 9 | [org.clojure/math.numeric-tower "0.0.4"] 10 | [com.stuartsierra/component "0.3.1"] 11 | [org.clojure/data.generators "0.1.2"] 12 | [org.clojure/tools.cli "0.3.3"] 13 | [org.clojure/algo.generic "0.1.2"] 14 | ;; [reloaded.repl "0.2.1"] 15 | [clj-http "2.0.1"] 16 | [prismatic/schema "1.1.3"] 17 | [robert/bruce "0.8.0"] 18 | [incanter "1.5.7"]] 19 | :resource-paths ["resources"] 20 | :main cook.sim.cli 21 | :source-paths ["src/main"] 22 | :profiles {:dev {:source-paths ["src/dev"] 23 | :repl-options {:init-ns cook.sim.repl} 24 | :dependencies [[reloaded.repl "0.2.1"]]}}) 25 | -------------------------------------------------------------------------------- /simulator/src/main/cook/sim/database.clj: -------------------------------------------------------------------------------- 1 | (ns cook.sim.database 2 | (:require [clojure.java.io :as io] 3 | [datomic.api :as d])) 4 | 5 | (defn recreate-database! 6 | "Given a Datomic database uri, deletes any existing database at the database, 7 | and creates a new one." 
(defn load-schema
  "Given a Datomic database connection and an IO resource location
  (e.g. filename), reads the edn map it contains and transacts every
  transaction found in the map's values, in order."
  [conn resource]
  (let [schema-map (read-string (slurp (io/resource resource)))]
    ;; each value of the map is a sequence of transactions
    (doseq [tx (apply concat (vals schema-map))]
      @(d/transact conn tx))))
20 | [path] 21 | (map->Config {:path path})) 22 | 23 | 24 | (defrecord SimDb [config conn] 25 | component/Lifecycle 26 | 27 | (start [component] 28 | (println "Connecting to simulation database...") 29 | (assoc component :conn (-> config :settings :sim-db-uri d/connect))) 30 | 31 | (stop [component] 32 | (assoc component :conn nil))) 33 | 34 | (defn new-sim-db 35 | "SimDb is a Datomic database that stores everything the Simulator wants to remember 36 | about simulations - workload descriptors, the users therein, the jobs those users 37 | will request during a simulation, etc." 38 | ([] (map->SimDb {})) 39 | ([config] (map->SimDb {:config config}))) 40 | 41 | 42 | (defrecord CookDb [config conn] 43 | component/Lifecycle 44 | 45 | (start [component] 46 | (println "Connecting to Cook database...") 47 | (assoc component :conn (-> config :settings :cook-db-uri d/connect))) 48 | 49 | (stop [component] 50 | (assoc component :conn nil))) 51 | 52 | (defn new-cook-db 53 | "CookDb is a reference to Cook Scheduler's own Datomic database. Many functions 54 | of the Simulator depend on having a connection available to this database. For 55 | example, the Cook database is queried to figure out what happened to various jobs 56 | in a Simulation in order to analyze how the Scheduler performed." 57 | ([] (map->CookDb {})) 58 | ([config] (map->CookDb {:config config}))) 59 | 60 | 61 | (defn system 62 | "Top level access point for all of the system components." 
63 | [config-path] 64 | (component/system-map 65 | :config (new-config config-path) 66 | :sim-db (component/using (new-sim-db) [:config]) 67 | :cook-db (component/using (new-cook-db) [:config]))) 68 | -------------------------------------------------------------------------------- /simulator/src/main/cook/sim/util.clj: -------------------------------------------------------------------------------- 1 | (ns cook.sim.util 2 | (:require [datomic.api :as d])) 3 | 4 | (defn transaction-times 5 | "Given a Datomic db snapshot and an entity id, returns the times associated with 6 | all transactions affecting the entity." 7 | [db eid] 8 | (->> (d/q '[:find ?instant 9 | :in $ ?e 10 | :where 11 | [?e _ _ ?tx] 12 | [?tx :db/txInstant ?instant]] 13 | (d/history db) eid) 14 | (map first) 15 | (sort))) 16 | 17 | (defn created-at 18 | "Given a Datomic db snapshot and an entity id, returns the time when the entity 19 | was first created (first transaction)." 20 | [db eid] 21 | (first (transaction-times db eid))) 22 | 23 | (defn updated-at 24 | "Given a Datomic db snapshot and an entity id, returns the time when the entity 25 | was last updated (last transaction)." 26 | [db eid] 27 | (last (transaction-times db eid))) 28 | 29 | (defn seconds 30 | "Returns a printable number of fractional seconds based on input milliseconds." 
31 | [millis] 32 | (float (/ millis 1000))) 33 | -------------------------------------------------------------------------------- /simulator/travis/prepare_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ev 3 | 4 | export PROJECT_DIR=`pwd` 5 | 6 | lein deps 7 | 8 | ../travis/prepare.sh 9 | 10 | docker pull python:3 11 | -------------------------------------------------------------------------------- /simulator/travis/run_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ev 3 | 4 | export PROJECT_DIR=`pwd` 5 | ${GITHUB_WORKSPACE}/travis/start_scheduler.sh 6 | 7 | cd ${PROJECT_DIR} 8 | lein run -c config/settings.edn setup-database -c travis/simulator_config.edn 9 | 10 | set +e 11 | lein run -c config/settings.edn travis -c travis/simulator_config.edn 12 | SIM_EXIT_CODE=$? 13 | 14 | if [ ${SIM_EXIT_CODE} -ne 0 ]; then 15 | echo "Displaying executor logs" 16 | ${GITHUB_WORKSPACE}/travis/show_executor_logs.sh 17 | fi 18 | 19 | exit ${SIM_EXIT_CODE} 20 | -------------------------------------------------------------------------------- /simulator/travis/scheduler_config.edn: -------------------------------------------------------------------------------- 1 | {:port 12321 2 | :hostname "172.17.0.1" 3 | :authorization {:http-basic true} 4 | :database {:datomic-uri "datomic:free://localhost:4334/cook-jobs"} 5 | :zookeeper {:local? 
true} 6 | :scheduler {:offer-incubate-ms 15000 7 | :task-constraints {:timeout-hours 1 8 | :timeout-interval-minutes 1 9 | :memory-gb 48 10 | :retry-limit 200 11 | :cpus 6}} 12 | :rebalancer {:interval-seconds 20 13 | :safe-dru-threshold 0.0 14 | ;; virtually ANY improvement in DRU balance should provoke preemption: 15 | :min-dru-diff 1E-309 16 | :max-preemption 64.0 17 | :dru-scale 1} 18 | 19 | :mesos {:master "zk://172.17.0.3:2181/mesos" ; minimesos zookeeper 20 | :failover-timeout-ms nil 21 | :leader-path "/cook-scheduler" 22 | :role "cook" 23 | :framework-id "cook-framework"} 24 | :compute-clusters [{:factory-fn cook.mesos.mesos-compute-cluster/factory-fn 25 | :config {:compute-cluster-name "default-compute-cluster-from-config-defaulting" 26 | :framework-id "cook-framework" 27 | :master "zk://172.17.0.3:2181/mesos" 28 | :failover-timeout nil 29 | :principal nil 30 | :role "cook" 31 | :framework-name nil}}] 32 | :unhandled-exceptions {:log-level :error} 33 | :metrics {:jmx true} 34 | :nrepl {:enabled? 
true 35 | :port 8888} 36 | :log {:file "log/cook.log" 37 | :levels {"datomic.db" :warn 38 | "datomic.peer" :warn 39 | "datomic.kv-cluster" :warn 40 | "cook.mesos.rebalancer" :debug 41 | :default :info}}} 42 | -------------------------------------------------------------------------------- /travis/build_cook_executor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | cd ${GITHUB_WORKSPACE}/executor 6 | pip install -r requirements.txt 7 | ./bin/prepare-executor.sh local ${GITHUB_WORKSPACE}/scheduler/resources/public 8 | tar -C ${GITHUB_WORKSPACE}/travis -xzf ./dist/cook-executor-local.tar.gz 9 | -------------------------------------------------------------------------------- /travis/gdrive_upload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import base64 4 | import os 5 | import requests 6 | import sys 7 | import warnings 8 | 9 | if len(sys.argv) != 3: 10 | print('USAGE: {} JOB-ID XZ-FILE', sys.argv[0]) 11 | print('Upload an xz-compressed file to our Google Drive stash') 12 | sys.exit(1) 13 | 14 | tarball_path = sys.argv[2] 15 | 16 | # upload to google drive 17 | app_url = os.environ.get('GDRIVE_LOG_POST_URL') 18 | 19 | if not app_url: 20 | print('Missing application url. 
Please set GDRIVE_LOG_POST_URL in the environment.') 21 | sys.exit(1) 22 | 23 | with open(tarball_path, 'rb') as tarball: 24 | post_data = { 25 | 'job_id': sys.argv[1], 26 | 'tarball': base64.b64encode(tarball.read()) 27 | } 28 | 29 | with warnings.catch_warnings(): 30 | warnings.simplefilter('ignore') 31 | response = requests.post(app_url, data=post_data, timeout=10) 32 | 33 | print() 34 | print('==============================') 35 | print('== UPLOAD RESPONSE:') 36 | print('==============================') 37 | print(response.text) 38 | print('==============================') 39 | print() 40 | 41 | if not response.text.strip().endswith('successfully'): 42 | print('UPLOAD FAILED!') 43 | sys.exit(1) 44 | -------------------------------------------------------------------------------- /travis/install_mesos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PACKAGE_CACHE_DIR=$HOME/.apt-cache 4 | DISTRO=$(lsb_release -is | tr '[:upper:]' '[:lower:]') 5 | CODENAME=$(lsb_release -cs) 6 | 7 | if [ -d "$PACKAGE_CACHE_DIR" ] && [ -n "$(find $PACKAGE_CACHE_DIR -name 'mesos_*.deb')" ]; then 8 | echo 'Using cached Mesos library...' 9 | cp -f $PACKAGE_CACHE_DIR/*.deb /var/cache/apt/archives/ 10 | else 11 | echo 'Downloading Mesos library...' 12 | apt-key adv --keyserver keyserver.ubuntu.com --recv E56151BF 13 | echo "deb http://repos.mesosphere.io/${DISTRO} ${CODENAME} main" | sudo tee /etc/apt/sources.list.d/mesosphere.list 14 | apt-get update -qq 15 | apt-get install mesos -y --download-only 16 | mkdir -p $PACKAGE_CACHE_DIR/ 17 | cp -f /var/cache/apt/archives/*.deb $PACKAGE_CACHE_DIR/ 18 | fi 19 | 20 | set -x 21 | 22 | apt-get install --allow-downgrades --fix-broken --no-download --yes $PACKAGE_CACHE_DIR/*.deb 23 | APT_EXIT_CODE=$? 24 | 25 | if [ $APT_EXIT_CODE -ne 0 ] || ! [ -f $MESOS_NATIVE_JAVA_LIBRARY ]; then 26 | echo 'Mesos installation error!' 
27 | exit $APT_EXIT_CODE 28 | fi 29 | -------------------------------------------------------------------------------- /travis/minimesos: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | MINIMESOS_TAG="0.14.20180710" 6 | PARAMS="--debug $@" 7 | MINIMESOS_CLI_IMAGE="twosigma/minimesos-cli" 8 | 9 | command_exists() { 10 | command -v "$@" > /dev/null 2>&1 11 | } 12 | 13 | DOCKER_VERSION=$(docker version --format "{{.Server.Version}}") 14 | SMALLEST_VERSION=$(printf "%s\n1.11.0\n" $DOCKER_VERSION | sort -t '.' -k 1,1 -k 2,2 -k 3,3 -k 4,4 -g | head -n 1) 15 | 16 | if ! command_exists docker || [ $SMALLEST_VERSION != "1.11.0" ]; then 17 | echo "Minimesos requires Docker 1.11.0 or higher" 18 | exit 1 19 | fi 20 | 21 | if [ "$DOCKER_HOST" != "" ] && [[ $DOCKER_HOST == tcp* ]]; then 22 | DOCKER_HOST_IP=$(echo "$DOCKER_HOST" | grep -o '[0-9]\+[.][0-9]\+[.][0-9]\+[.][0-9]\+') 23 | elif command_exists docker-machine && [ "$DOCKER_MACHINE_NAME" != "" ]; then 24 | DOCKER_HOST_IP=$(docker-machine ip ${DOCKER_MACHINE_NAME}) 25 | elif [ $(uname) != "Darwin" ]; then 26 | DOCKER_HOST_IP=$(ip addr show dev docker0 | grep inet | sed -r "s/.*inet\s([0-9\.]+)\/.*/\1/" | head -n 1) 27 | else 28 | DOCKER_HOST_IP="" 29 | fi 30 | 31 | pullImage() { 32 | if [ "$(docker images $1 | grep $2 2> /dev/null)" = "" ]; then 33 | echo "Pulling $1:$2" 34 | docker pull "$1:$2" 35 | fi 36 | } 37 | 38 | if [ "$#" -gt 0 -a "$1" = up ]; then 39 | pullImage ${MINIMESOS_CLI_IMAGE} ${MINIMESOS_TAG} 40 | fi 41 | 42 | if [ $(uname) == "Darwin" ]; then 43 | MINIMESOS_OS="Mac OS X" 44 | else 45 | MINIMESOS_OS="Linux" 46 | fi 47 | 48 | MINIMESOS_HOST_DIR="$(pwd)" 49 | MINIMESOS_DIR="$(pwd)/.minimesos" 50 | if [ ! -d "${MINIMESOS_DIR}" ]; then 51 | mkdir -p "${MINIMESOS_DIR}" 52 | echo "# Created minimesos directory at ${MINIMESOS_DIR}." 
53 | fi 54 | 55 | docker run --rm -v "${MINIMESOS_HOST_DIR}":"${MINIMESOS_HOST_DIR}" \ 56 | -v /var/run/docker.sock:/var/run/docker.sock \ 57 | -v /sys/fs/cgroup:/sys/fs/cgroup \ 58 | -i \ 59 | --env DOCKER_HOST_IP=${DOCKER_HOST_IP} \ 60 | --env MINIMESOS_OS="${MINIMESOS_OS}" \ 61 | --entrypoint java \ 62 | ${MINIMESOS_CLI_IMAGE}:${MINIMESOS_TAG} \ 63 | -Dminimesos.host.dir="${MINIMESOS_HOST_DIR}" \ 64 | -jar /usr/local/share/minimesos/minimesos-cli.jar ${PARAMS} 65 | -------------------------------------------------------------------------------- /travis/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ev 3 | 4 | cd ${GITHUB_WORKSPACE}/scheduler 5 | lein deps 6 | lein uberjar 7 | VERSION=$(lein print :version | tr -d '"') 8 | 9 | cd ${GITHUB_WORKSPACE}/travis 10 | unzip ${GITHUB_WORKSPACE}/scheduler/datomic/datomic-free-0.9.5394.zip 11 | cp "${GITHUB_WORKSPACE}/scheduler/target/cook-${VERSION}.jar" datomic-free-0.9.5394/lib/ 12 | -------------------------------------------------------------------------------- /travis/show_executor_logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -v 3 | 4 | echo "Printing out all executor logs..." 
5 | while read path; do 6 | echo "Contents of ${path}"; 7 | cat "${path}"; 8 | echo "------------------------------------" 9 | done <<< "$(find ${GITHUB_WORKSPACE}/travis/.minimesos -name 'stdout' -o -name 'stderr' -o -name 'executor.log')" 10 | 11 | ${GITHUB_WORKSPACE}/travis/show_scheduler_logs.sh 12 | -------------------------------------------------------------------------------- /travis/show_scheduler_logs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | for log in ${GITHUB_WORKSPACE}/scheduler/log/cook*.log; 4 | do 5 | echo "Contents of ${log}" 6 | cat "${log}"; 7 | echo "------------------------------------" 8 | done 9 | -------------------------------------------------------------------------------- /travis/start_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ev 3 | 4 | cd ${GITHUB_WORKSPACE}/travis 5 | 6 | ./build_cook_executor.sh 7 | ./datomic-free-0.9.5394/bin/transactor ${GITHUB_WORKSPACE}/scheduler/datomic/datomic_transactor.properties & 8 | ./minimesos up 9 | 10 | cd ${GITHUB_WORKSPACE}/scheduler 11 | # on travis, ports on 172.17.0.1 are bindable from the host OS, and are also 12 | # available for processes inside minimesos containers to connect to 13 | LIBPROCESS_IP=172.17.0.1 lein run ${GITHUB_WORKSPACE}/travis/scheduler_config.edn & 14 | -------------------------------------------------------------------------------- /travis/upload_logs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cd ${GITHUB_WORKSPACE} 6 | 7 | # Create dump name 8 | repo=${GITHUB_REPOSITORY} 9 | pr_number=$(jq -r ".pull_request.number" "$GITHUB_EVENT_PATH") 10 | dump_name="${repo//\//-}-PR${pr_number}-${GITHUB_WORKFLOW// /-}-$GITHUB_RUN_ID" 11 | 12 | # List the last 10 containers 13 | docker ps --all --last 10 14 | 15 | # Grab the Mesos master logs 16 | mkdir -p 
./mesos/master-logs 17 | mesos_master_container=$(docker ps --all --latest --filter "name=minimesos-master-" --format "{{.ID}}") 18 | docker cp --follow-link $mesos_master_container:/var/log/mesos-master.INFO ./mesos/master-logs/ 19 | docker cp --follow-link $mesos_master_container:/var/log/mesos-master.WARNING ./mesos/master-logs/ 20 | 21 | # Grab the Mesos agent logs 22 | mesos_agent_containers=$(docker ps --all --last 6 --filter "name=minimesos-agent-" --format "{{.ID}}") 23 | for container in $mesos_agent_containers; 24 | do 25 | destination=./mesos/agent-logs/$container 26 | mkdir -p $destination 27 | docker cp --follow-link $container:/var/log/mesos-slave.INFO $destination 28 | docker cp --follow-link $container:/var/log/mesos-slave.WARNING $destination 29 | docker cp --follow-link $container:/var/log/mesos-slave.ERROR $destination 30 | docker cp --follow-link $container:/var/log/mesos-fetcher.INFO $destination || echo "Container $container does not have mesos-fetcher.INFO" 31 | done 32 | 33 | tarball=./dump.txz 34 | tar -cJf $tarball --transform="s|\./[^/]*/\.*|${dump_name}/|" --warning=no-file-changed ./scheduler/log ./travis/.minimesos ./mesos/master-logs ./mesos/agent-logs || exitcode=$? 35 | # GNU tar always exits with 0, 1 or 2 (https://www.gnu.org/software/tar/manual/html_section/tar_19.html) 36 | # 0 = Successful termination 37 | # 1 = Some files differ (we're OK with this) 38 | # 2 = Fatal error 39 | if [ "$exitcode" == "2" ]; then 40 | echo "The tar command exited with exit code $exitcode, exiting..." 41 | exit $exitcode 42 | fi 43 | ./travis/gdrive_upload "travis-${dump_name}" $tarball 44 | --------------------------------------------------------------------------------