├── .git_archival.txt ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── ci-docker.yml │ ├── copilot-setup-steps.yml │ ├── machines.yml │ ├── mirror_gitee.yml │ ├── publish_conda.yml │ ├── pyright.yml │ ├── release.yml │ ├── test-bohrium.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── AGENTS.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── ci ├── LICENSE ├── README.md ├── pbs.sh ├── pbs │ ├── docker-compose.yml │ └── start-pbs.sh ├── slurm.sh ├── slurm │ ├── docker-compose.yml │ ├── register_cluster.sh │ └── start-slurm.sh ├── ssh.sh ├── ssh │ ├── docker-compose.yml │ └── start-ssh.sh └── ssh_rsync.sh ├── codecov.yml ├── conda ├── conda_build_config.yaml └── meta.yaml ├── doc ├── .gitignore ├── Makefile ├── batch.md ├── cli.rst ├── conf.py ├── context.md ├── credits.rst ├── dpdispatcher_on_yarn.md ├── env.md ├── examples │ ├── expanse.md │ ├── g16.md │ ├── shell.md │ └── template.md ├── getting-started.md ├── index.rst ├── install.md ├── machine.rst ├── make.bat ├── pep723.rst ├── requirements.txt ├── resources.rst ├── run.md └── task.rst ├── dpdispatcher ├── __init__.py ├── __main__.py ├── arginfo.py ├── base_context.py ├── contexts │ ├── __init__.py │ ├── dp_cloud_server_context.py │ ├── hdfs_context.py │ ├── lazy_local_context.py │ ├── local_context.py │ ├── openapi_context.py │ └── ssh_context.py ├── dlog.py ├── dpcloudserver │ ├── __init__.py │ └── client.py ├── dpdisp.py ├── entrypoints │ ├── __init__.py │ ├── gui.py │ ├── run.py │ └── submission.py ├── machine.py ├── machines │ ├── JH_UniScheduler.py │ ├── __init__.py │ ├── distributed_shell.py │ ├── dp_cloud_server.py │ ├── fugaku.py │ ├── lsf.py │ ├── openapi.py │ ├── pbs.py │ ├── shell.py │ └── slurm.py ├── run.py ├── submission.py └── utils │ ├── __init__.py │ ├── dpcloudserver │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── retcode.py │ └── zip_file.py │ ├── hdfs_cli.py │ ├── job_status.py │ ├── record.py │ └── utils.py ├── examples ├── dpdisp_run.py ├── machine │ ├── expanse.json │ ├── lazy_local.json │ ├── mandu.json │ └── ssh_proxy_command.json ├── resources │ ├── expanse_cpu.json │ ├── mandu.json │ ├── template.slurm │ └── tiger.json └── task │ ├── deepmd-kit.json │ └── g16.json ├── pyproject.toml ├── scripts ├── script_gen_dargs_docs.py └── script_gen_dargs_json.py └── tests ├── .gitignore ├── __init__.py ├── batch.json ├── context.py ├── debug_test_class_submission_init.py ├── devel_test_JH_UniScheduler.py ├── devel_test_ali_ehpc.py ├── devel_test_dp_cloud_server.py ├── devel_test_lazy_ali_ehpc.py ├── devel_test_lsf.py ├── devel_test_shell.py ├── devel_test_slurm.py ├── devel_test_ssh_ali_ehpc.py ├── graph.pb ├── hello_world.py ├── jsons ├── job.json ├── machine.json ├── machine_JH_UniScheduler.json ├── machine_ali_ehpc.json ├── machine_center.json ├── machine_diffenert.json ├── machine_dp_cloud_server.json ├── machine_fugaku.json ├── machine_if_cuda_multi_devices.json ├── machine_lazy_local_jh_unischeduler.json ├── machine_lazy_local_lsf.json ├── machine_lazy_local_slurm.json ├── machine_lazylocal_shell.json ├── machine_local_fugaku.json ├── machine_local_shell.json ├── machine_lsf.json ├── machine_openapi.json ├── machine_slurm.json ├── machine_yarn.json ├── resources.json ├── submission.json └── task.json ├── sample_class.py ├── script_gen_json.py ├── slurm_test.env ├── test_JH_UniScheduler_script_generation.py ├── test_argcheck.py ├── test_class_job.py ├── test_class_machine.py ├── test_class_machine_dispatch.py ├── 
test_class_resources.py ├── test_class_submission.py ├── test_class_submission_init.py ├── test_class_task.py ├── test_cli.py ├── test_context_dir └── 0_md │ ├── bct-1 │ ├── conf.lmp │ ├── input.lammps │ └── some_dir │ │ └── some_file │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ ├── dir with space │ └── file with space │ ├── graph.pb │ └── some_dir │ └── some_file ├── test_examples.py ├── test_group_size.py ├── test_gui.py ├── test_hdfs_context.py ├── test_hdfs_dir └── 0_md │ ├── bct-1 │ ├── conf.lmp │ └── input.lammps │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ └── graph.pb ├── test_if_cuda_multi_devices └── test_dir │ └── test.txt ├── test_import_classes.py ├── test_jh_unischeduler └── 0_md │ ├── bct-1 │ ├── conf.lmp │ └── input.lammps │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ └── graph.pb ├── test_lazy_local_context.py ├── test_local_context.py ├── test_lsf_dir └── 0_md │ ├── bct-1 │ ├── conf.lmp │ └── input.lammps │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ ├── graph.pb │ └── submission.json ├── test_lsf_script_generation.py ├── test_pbs_dir └── 0_md │ ├── bct-1 │ ├── conf.lmp │ └── input.lammps │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ └── graph.pb ├── test_retry.py ├── test_rsync_flags.py ├── test_rsync_proxy.py ├── test_run.py ├── test_run_submission.py ├── test_run_submission_bohrium.py ├── test_run_submission_ratio_unfinished.py ├── test_shell_cuda_multi_devices.py ├── test_shell_trival.py ├── test_shell_trival_dir ├── fail_dir │ └── mock_fail_task.txt ├── parent_dir │ ├── dir with space │ │ └── example.txt │ ├── dir1 │ │ └── example.txt │ ├── dir2 │ │ └── example.txt │ ├── dir3 │ │ └── example.txt │ ├── dir4 │ │ └── example.txt │ └── graph.pb └── recover_dir │ └── mock_recover_task.txt ├── test_slurm_dir └── 0_md │ ├── bct-1 │ ├── conf.lmp │ └── input.lammps │ ├── bct-2 │ ├── conf.lmp │ └── input.lammps │ ├── bct-3 │ ├── conf.lmp │ └── input.lammps │ ├── bct-4 │ ├── conf.lmp │ └── input.lammps │ ├── d3c842c5b9476e48f7145b370cd330372b9293e1.json │ ├── graph.pb │ └── submission.json ├── test_slurm_script_generation.py ├── test_ssh_context.py ├── test_ssh_jump_host.py └── test_work_path └── .gitkeep /.git_archival.txt: -------------------------------------------------------------------------------- 1 | node: 4816095c9e711259877fb90023ce74ce527ba5c3 2 | node-date: 2025-10-13T18:04:52+08:00 3 | describe-name: v0.6.12-1-g4816095c 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | .git_archival.txt export-subst 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/ci-docker.yml: 
-------------------------------------------------------------------------------- 1 | name: Build docker image and push to Docker Hub 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | build-n-push: 10 | name: Build docker image and push to Docker Hub 11 | if: github.repository == 'deepmodeling/dpdispatcher' 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repo 15 | uses: actions/checkout@v5 16 | 17 | - name: Log in to Docker Hub 18 | uses: docker/login-action@v3 19 | with: 20 | username: ${{ secrets.DOCKER_USERNAME }} 21 | password: ${{ secrets.DOCKER_PASSWORD }} 22 | 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | id: buildx 28 | uses: docker/setup-buildx-action@v3 29 | 30 | - name: Build and push 31 | run: | 32 | docker buildx build --platform linux/arm64,linux/amd64 -t dptechnology/dpdispatcher:${{ github.ref_name }} -t dptechnology/dpdispatcher:latest --push . 33 | -------------------------------------------------------------------------------- /.github/workflows/copilot-setup-steps.yml: -------------------------------------------------------------------------------- 1 | name: "Copilot Setup Steps" 2 | 3 | # Automatically run the setup steps when they are changed to allow for easy validation, and 4 | # allow manual testing through the repository's "Actions" tab 5 | on: 6 | workflow_dispatch: 7 | push: 8 | paths: 9 | - .github/workflows/copilot-setup-steps.yml 10 | pull_request: 11 | paths: 12 | - .github/workflows/copilot-setup-steps.yml 13 | 14 | jobs: 15 | # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot. 16 | copilot-setup-steps: 17 | runs-on: ubuntu-latest 18 | 19 | # Set the permissions to the lowest permissions possible needed for your steps. 20 | # Copilot will be given its own token for its operations. 21 | permissions: 22 | # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete. 23 | contents: read 24 | 25 | # You can define any steps you want, and they will run before the agent starts. 26 | # If you do not check out your code, Copilot will do this for you. 
27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v5 30 | 31 | - name: Set up Python 32 | uses: actions/setup-python@v6 33 | with: 34 | python-version: "3.11" 35 | 36 | - name: Install uv 37 | run: | 38 | # Install uv using pip to avoid network restrictions 39 | python -m pip install --upgrade pip 40 | python -m pip install uv 41 | 42 | - name: Create virtual environment and install dependencies 43 | run: | 44 | uv venv .venv 45 | source .venv/bin/activate 46 | uv pip install .[test] coverage 47 | 48 | - name: Install development tools 49 | run: | 50 | uv tool install pre-commit 51 | uv tool install pyright 52 | 53 | - name: Set up pre-commit hooks 54 | run: | 55 | source .venv/bin/activate 56 | pre-commit install --install-hooks 57 | 58 | - name: Verify installation 59 | run: | 60 | source .venv/bin/activate 61 | python --version 62 | uv --version 63 | pre-commit --version 64 | pyright --version 65 | python -c "import dpdispatcher; print('DPDispatcher installed successfully')" 66 | -------------------------------------------------------------------------------- /.github/workflows/machines.yml: -------------------------------------------------------------------------------- 1 | name: Test on different machines 2 | 3 | 'on': 4 | push: 5 | branches-ignore: 6 | - 'copilot/**' 7 | - 'dependabot/**' 8 | - 'pre-commit-ci-update-config' 9 | pull_request: 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | machine: 17 | - slurm 18 | - pbs 19 | - ssh 20 | - ssh_rsync 21 | steps: 22 | - uses: actions/checkout@v5 23 | - run: ./ci/${{ matrix.machine }}.sh 24 | env: 25 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/mirror_gitee.yml: -------------------------------------------------------------------------------- 1 | name: Mirror to Gitee Repo 2 | 3 | on: [ push, delete, create ] 4 | 5 | # Ensures that only one mirror task will run at a time. 
6 | concurrency: 7 | group: git-mirror 8 | 9 | jobs: 10 | git-mirror: 11 | uses: deepmodeling/workflows/.github/workflows/mirror_gitee.yml@main 12 | secrets: 13 | SYNC_GITEE_PRIVATE_KEY: ${{ secrets.SYNC_GITEE_PRIVATE_KEY }} 14 | -------------------------------------------------------------------------------- /.github/workflows/publish_conda.yml: -------------------------------------------------------------------------------- 1 | name: publish_conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v5 13 | - name: publish-to-conda 14 | uses: felix5572/conda-publish-action@v1.9 15 | with: 16 | subdir: 'conda' 17 | anacondatoken: ${{ secrets.ANACONDA_TOKEN }} 18 | platforms: 'noarch' 19 | -------------------------------------------------------------------------------- /.github/workflows/pyright.yml: -------------------------------------------------------------------------------- 1 | 'on': 2 | push: 3 | branches-ignore: 4 | - 'copilot/**' 5 | - 'dependabot/**' 6 | - 'pre-commit-ci-update-config' 7 | pull_request: 8 | name: Type checker 9 | jobs: 10 | test: 11 | name: pyright 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v5 15 | - uses: actions/setup-python@v6 16 | with: 17 | python-version: '3.11' 18 | - run: pip install uv 19 | - run: uv pip install --system -e .[cloudserver,gui] 20 | - uses: jakebailey/pyright-action@v2 21 | with: 22 | version: 1.1.404 23 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | 'on': 2 | push: 3 | branches-ignore: 4 | - 'copilot/**' 5 | - 'dependabot/**' 6 | - 'pre-commit-ci-update-config' 7 | tags: 8 | - 'v*' 9 | pull_request: 10 | name: Release to pypi 11 | jobs: 12 | release-to-pypi: 13 | name: Release to pypi 14 | runs-on: ubuntu-latest 15 | permissions: 16 | # IMPORTANT: this permission is mandatory for trusted publishing 17 | id-token: write 18 | steps: 19 | - uses: actions/checkout@v5 20 | - name: Setup python 21 | uses: actions/setup-python@v6 22 | with: 23 | python-version: 3.x 24 | architecture: x64 25 | - name: Install dependencies 26 | run: python -m pip install build 27 | - run: python -m build 28 | - name: Publish a Python distribution to PyPI 29 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | verbose: true 33 | -------------------------------------------------------------------------------- /.github/workflows/test-bohrium.yml: -------------------------------------------------------------------------------- 1 | name: Test Bohrium 2 | 3 | 'on': 4 | push: 5 | branches-ignore: 6 | - 'copilot/**' 7 | - 'dependabot/**' 8 | - 'pre-commit-ci-update-config' 9 | pull_request_target: 10 | types: 11 | - "labeled" 12 | 13 | jobs: 14 | test: 15 | runs-on: ubuntu-latest 16 | environment: bohrium 17 | if: github.repository_owner == 'deepmodeling' && (github.event.label.name == 'Test Bohrium' || github.event_name == 'push') 18 | steps: 19 | - uses: actions/checkout@v5 20 | with: 21 | ref: "${{ github.event.pull_request.merge_commit_sha }}" 22 | - name: Set up Python 3.12 23 | uses: actions/setup-python@v6 24 | with: 25 | python-version: '3.12' 26 | cache: 'pip' 27 | - run: pip install uv 28 | - run: uv pip install --system .[bohrium] coverage 29 | - name: Test 30 | run: coverage run --source=./dpdispatcher -m unittest 
-v tests/test_run_submission_bohrium.py && coverage report 31 | env: 32 | DPDISPATCHER_TEST: bohrium 33 | BOHRIUM_EMAIL: ${{ secrets.BOHRIUM_EMAIL }} 34 | BOHRIUM_PASSWORD: ${{ secrets.BOHRIUM_PASSWORD }} 35 | BOHRIUM_PROJECT_ID: ${{ secrets.BOHRIUM_PROJECT_ID }} 36 | BOHRIUM_ACCESS_KEY: ${{ secrets.BOHRIUM_ACCESS_KEY }} 37 | - uses: codecov/codecov-action@v5 38 | env: 39 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 40 | remove_label: 41 | permissions: 42 | contents: read 43 | pull-requests: write 44 | # so one can re-trigger the workflow without manually removing the label 45 | runs-on: ubuntu-latest 46 | if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test Bohrium' 47 | steps: 48 | - uses: actions-ecosystem/action-remove-labels@v1 49 | with: 50 | labels: Test Bohrium 51 | number: ${{ github.event.pull_request.number }} 52 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | 'on': 4 | push: 5 | branches-ignore: 6 | - 'copilot/**' 7 | - 'dependabot/**' 8 | - 'pre-commit-ci-update-config' 9 | pull_request: 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.platform }} 14 | strategy: 15 | matrix: 16 | python-version: 17 | - 3.7 18 | - 3.9 19 | - '3.10' 20 | - '3.11' 21 | - '3.12' 22 | platform: 23 | - ubuntu-22.04 24 | - macos-latest 25 | - windows-latest 26 | exclude: # Apple Silicon ARM64 does not support Python < v3.8 27 | - python-version: "3.7" 28 | platform: macos-latest 29 | include: # So run those legacy versions on Intel CPUs 30 | - python-version: "3.7" 31 | platform: macos-15-intel 32 | steps: 33 | - uses: actions/checkout@v5 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v6 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | - uses: astral-sh/setup-uv@v7 39 | with: 40 | enable-cache: true 41 | cache-dependency-glob: | 42 | **/requirements*.txt 43 | **/pyproject.toml 44 | - run: uv pip install --system .[test] coverage 45 | - name: Test 46 | run: | 47 | python -m coverage run -p --source=./dpdispatcher -m unittest -v 48 | python -m coverage run -p --source=./dpdispatcher -m dpdispatcher -h 49 | python -m coverage combine 50 | python -m coverage report 51 | - uses: codecov/codecov-action@v5 52 | env: 53 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 54 | pass: 55 | needs: [test] 56 | runs-on: ubuntu-latest 57 | if: always() 58 | steps: 59 | - name: Decide whether the needed jobs succeeded or failed 60 | uses: re-actors/alls-green@release/v1 61 | with: 62 | jobs: ${{ toJSON(needs) }} 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | POTCAR 2 | # *.pb 3 | tests/graph_real.pb 4 | *~ 5 | *.d 6 | *.o 7 | *.aux 8 | *.dvi 9 | *.pdf 10 | *.so 11 | *.bin 12 | *.intbin 13 | *.meta 14 | *.log 15 | *.bz2 16 | *.pyc 17 | \#* 18 | iter.* 19 | 20 | # out.txt 21 | topol.tpr 22 | mdout.mdp 23 | traj*xtc 24 | traj.trr 25 | ener.edr 26 | state*cpt 27 | CMakeCache.txt 28 | CMakeFiles 29 | log.lammps 30 | restart.* 31 | dump.* 32 | *.out 33 | build 34 | dist 35 | pydispatcher.egg-info 36 | */*.pyc 37 | */__pycache__ 38 | *.swp 39 | .eggs 40 | .coverage* 41 | dbconfig.json 42 | .vscode/* 43 | .idea 44 | */_version.py 45 | */_date.py 46 | *.egg 47 | *.egg-info 48 | venv/* 49 | node_modules/ 50 | # Test artifacts 51 | 
*_flag_if_job_task_fail 52 | *_job_id 53 | *_job_tag_finished 54 | *_task_tag_finished 55 | *.sub 56 | *.sub.run 57 | script_*.py 58 | # Job execution temporary files 59 | err 60 | log 61 | # JSON files with hash names (job state files) 62 | [0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]*.json 63 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v6.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | exclude: "^tests/" 9 | - id: end-of-file-fixer 10 | exclude: "^tests/" 11 | - id: check-yaml 12 | exclude: "^conda/" 13 | - id: check-json 14 | - id: check-added-large-files 15 | - id: check-merge-conflict 16 | - id: check-symlinks 17 | - id: check-toml 18 | # Python 19 | - repo: https://github.com/astral-sh/ruff-pre-commit 20 | # Ruff version. 21 | rev: v0.12.8 22 | hooks: 23 | - id: ruff 24 | args: ["--fix"] 25 | - id: ruff-format 26 | # numpydoc 27 | - repo: https://github.com/Carreau/velin 28 | rev: 0.0.12 29 | hooks: 30 | - id: velin 31 | args: ["--write"] 32 | # Python inside docs 33 | - repo: https://github.com/asottile/blacken-docs 34 | rev: 1.19.1 35 | hooks: 36 | - id: blacken-docs 37 | # markdown, yaml 38 | - repo: https://github.com/pre-commit/mirrors-prettier 39 | rev: v4.0.0-alpha.8 40 | hooks: 41 | - id: prettier 42 | types_or: [markdown, yaml] 43 | # workflow files cannot be modified by pre-commit.ci 44 | exclude: ^(\.github/workflows|conda) 45 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | jobs: 14 | post_create_environment: 15 | - pip install uv 16 | post_install: 17 | - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH uv pip install .[docs] 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: doc/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | formats: all 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute 2 | 3 | DPDispatcher welcomes everyone (and every organization) to use it under the LGPL-3.0 License. 4 | 5 | Contributions are welcome and greatly appreciated! Every little bit helps, and credit will always be given. 6 | 7 | If you want to contribute to dpdispatcher, just open an issue, submit a pull request, leave a comment on GitHub Discussions, or contact the DeepModeling team. 8 | 9 | Any form of improvement is welcome:
10 | 11 | - use, star, or fork dpdispatcher 12 | - improve the documentation 13 | - report or fix bugs 14 | - request, discuss, or implement features 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12 AS compile-image 2 | 3 | RUN python -m venv /opt/venv 4 | # Make sure we use the virtualenv 5 | ENV PATH="/opt/venv/bin:$PATH" 6 | 7 | WORKDIR /data/dpdispatcher 8 | COPY ./ ./ 9 | RUN pip install .[bohrium] 10 | 11 | FROM python:3.12 AS build-image 12 | COPY --from=compile-image /opt/venv /opt/venv 13 | ENV PATH="/opt/venv/bin:$PATH" 14 | CMD ["/bin/bash"] 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DPDispatcher 2 | 3 | [![conda-forge](https://img.shields.io/conda/dn/conda-forge/dpdispatcher?color=red&label=conda-forge&logo=conda-forge)](https://anaconda.org/conda-forge/dpdispatcher) 4 | [![pip install](https://img.shields.io/pypi/dm/dpdispatcher?label=pip%20install&logo=pypi)](https://pypi.org/project/dpdispatcher) 5 | [![docker pull](https://img.shields.io/docker/pulls/dptechnology/dpdispatcher?logo=docker)](https://hub.docker.com/r/dptechnology/dpdispatcher) 6 | [![Documentation Status](https://readthedocs.org/projects/dpdispatcher/badge/)](https://dpdispatcher.readthedocs.io/) 7 | 8 | DPDispatcher is a Python package used to generate job input scripts for HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium), submit them to HPC systems, and poke them until they finish. 9 | 10 | DPDispatcher will monitor (poke) until these jobs finish and download the result files (if these jobs are running on remote systems connected by SSH). 11 | 12 | For more information, check the [documentation](https://dpdispatcher.readthedocs.io/). 13 | 14 | ## Installation 15 | 16 | DPDispatcher can be installed by `pip`: 17 | 18 | ```bash 19 | pip install dpdispatcher 20 | ``` 21 | 22 | To add [Bohrium](https://bohrium.dp.tech/) support, execute 23 | 24 | ```bash 25 | pip install dpdispatcher[bohrium] 26 | ``` 27 | 28 | ## Usage 29 | 30 | See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage. 31 | 32 | ## Contributing 33 | 34 | DPDispatcher is maintained by DeepModeling's developers and welcomes contributions from everyone. 35 | See the [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓 36 | 37 | ## References 38 | 39 | DPDispatcher is derived from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206). 40 | -------------------------------------------------------------------------------- /ci/LICENSE: -------------------------------------------------------------------------------- 1 | The files about slurm and pbs in this directory are originally under the following license: 2 | 3 | Copyright (c) 2018, Anaconda, Inc. and contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | Redistributions of source code must retain the above copyright notice, 10 | this list of conditions and the following disclaimer.
11 | 12 | Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | Neither the name of Anaconda nor the names of any contributors may be used to 17 | endorse or promote products derived from this software without specific prior 18 | written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 24 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 30 | THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /ci/README.md: -------------------------------------------------------------------------------- 1 | ## Notes 2 | 3 | Files about `slurm` and `pbs` in this directory is originally taken from [dask/dask-jobqueue](https://github.com/dask/dask-jobqueue) under [BSD 3-Clause "New" or "Revised" License](LICENSE). 4 | They have been relicensed under [LPGL 3.0](../LICENSE) as [they are compatible](https://www.gnu.org/licenses/license-list.html#ModifiedBSD). 5 | -------------------------------------------------------------------------------- /ci/pbs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | cd ./ci/pbs 5 | docker compose pull 6 | ./start-pbs.sh 7 | cd - 8 | 9 | docker exec pbs_master /bin/bash -c "chmod -R 777 /shared_space" 10 | docker exec pbs_master /bin/bash -c "chown -R pbsuser:pbsuser /home/pbsuser" 11 | 12 | docker exec pbs_master /bin/bash -c "yum install -y procps" 13 | docker exec pbs_master /bin/bash -c "cd /dpdispatcher && pip install uv && uv pip install --system .[test] coverage && chown -R pbsuser ." 14 | docker exec -u pbsuser pbs_master /bin/bash -c "cd /dpdispatcher && coverage run --source=./dpdispatcher -m unittest -v && coverage report" 15 | docker exec -u pbsuser --env-file <(env | grep GITHUB) pbs_master /bin/bash -c "cd /dpdispatcher && curl -Os https://uploader.codecov.io/latest/linux/codecov && chmod +x codecov && ./codecov" 16 | -------------------------------------------------------------------------------- /ci/pbs/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | master: 5 | image: daskdev/dask-jobqueue:pbs 6 | build: . 7 | container_name: pbs_master 8 | hostname: pbs_master 9 | environment: 10 | - CI_SHARED_SPACE=/shared_space 11 | - DPDISPATCHER_TEST=pbs 12 | volumes: 13 | - ../..:/dpdispatcher 14 | - userhome:/home/pbsuser 15 | - shared_space:/shared_space 16 | command: bash /run-master.sh 17 | 18 | slave_one: 19 | image: daskdev/dask-jobqueue:pbs 20 | build: . 
21 | container_name: pbs_slave_1 22 | hostname: pbs_slave_1 23 | volumes: 24 | - userhome:/home/pbsuser 25 | - shared_space:/shared_space 26 | entrypoint: "bash /slave-entrypoint.sh" 27 | command: bash /run-slave.sh 28 | links: 29 | - "master:pbs_master" 30 | environment: 31 | - PBS_MASTER=pbs_master 32 | depends_on: 33 | - master 34 | 35 | slave_two: 36 | image: daskdev/dask-jobqueue:pbs 37 | build: . 38 | container_name: pbs_slave_2 39 | hostname: pbs_slave_2 40 | volumes: 41 | - userhome:/home/pbsuser 42 | - shared_space:/shared_space 43 | entrypoint: "bash /slave-entrypoint.sh" 44 | command: bash /run-slave.sh 45 | links: 46 | - "master:pbs_master" 47 | environment: 48 | - PBS_MASTER=pbs_master 49 | depends_on: 50 | - master 51 | 52 | volumes: 53 | userhome: 54 | shared_space: 55 | -------------------------------------------------------------------------------- /ci/pbs/start-pbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker compose up -d --no-build 4 | while [ `docker exec -u pbsuser pbs_master pbsnodes -a | grep "Mom = pbs_slave" | wc -l` -ne 2 ] 5 | do 6 | echo "Waiting for PBS slave nodes to become available"; 7 | sleep 2 8 | done 9 | echo "PBS properly configured" 10 | -------------------------------------------------------------------------------- /ci/slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | cd ./ci/slurm 5 | docker compose pull 6 | ./start-slurm.sh 7 | cd - 8 | 9 | docker exec slurmctld /bin/bash -c "yum install -y procps" 10 | docker exec slurmctld /bin/bash -c "cd dpdispatcher && pip install uv && uv pip install --system .[test] coverage && coverage run --source=./dpdispatcher -m unittest -v && coverage report" 11 | docker exec --env-file <(env | grep -e GITHUB -e CODECOV) slurmctld /bin/bash -c "cd dpdispatcher && curl -Os https://uploader.codecov.io/latest/linux/codecov && chmod +x codecov && ./codecov" 12 | -------------------------------------------------------------------------------- /ci/slurm/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.2" 2 | 3 | services: 4 | mysql: 5 | image: mysql:5.7.29 6 | hostname: mysql 7 | container_name: mysql 8 | environment: 9 | MYSQL_RANDOM_ROOT_PASSWORD: "yes" 10 | MYSQL_DATABASE: slurm_acct_db 11 | MYSQL_USER: slurm 12 | MYSQL_PASSWORD: password 13 | volumes: 14 | - var_lib_mysql:/var/lib/mysql 15 | networks: 16 | common-network: 17 | 18 | slurmdbd: 19 | image: daskdev/dask-jobqueue:slurm 20 | build: . 21 | command: ["slurmdbd"] 22 | container_name: slurmdbd 23 | hostname: slurmdbd 24 | volumes: 25 | - etc_munge:/etc/munge 26 | - etc_slurm:/etc/slurm 27 | - var_log_slurm:/var/log/slurm 28 | expose: 29 | - "6819" 30 | depends_on: 31 | - mysql 32 | networks: 33 | common-network: 34 | 35 | slurmctld: 36 | image: daskdev/dask-jobqueue:slurm 37 | build: . 38 | command: ["slurmctld"] 39 | container_name: slurmctld 40 | hostname: slurmctld 41 | environment: 42 | CI_SHARED_SPACE: /data 43 | DPDISPATCHER_TEST: slurm 44 | volumes: 45 | - etc_munge:/etc/munge 46 | - etc_slurm:/etc/slurm 47 | - slurm_jobdir:/data 48 | - var_log_slurm:/var/log/slurm 49 | - ../..:/dpdispatcher 50 | expose: 51 | - "6817" 52 | depends_on: 53 | - "slurmdbd" 54 | networks: 55 | common-network: 56 | ipv4_address: 10.1.1.10 57 | cap_add: 58 | - NET_ADMIN 59 | 60 | c1: 61 | image: daskdev/dask-jobqueue:slurm 62 | build: . 
63 | command: ["slurmd"] 64 | hostname: c1 65 | container_name: c1 66 | volumes: 67 | - etc_munge:/etc/munge 68 | - etc_slurm:/etc/slurm 69 | - slurm_jobdir:/data 70 | - var_log_slurm:/var/log/slurm 71 | expose: 72 | - "6818" 73 | depends_on: 74 | - "slurmctld" 75 | networks: 76 | common-network: 77 | ipv4_address: 10.1.1.11 78 | cap_add: 79 | - NET_ADMIN 80 | 81 | c2: 82 | image: daskdev/dask-jobqueue:slurm 83 | build: . 84 | command: ["slurmd"] 85 | hostname: c2 86 | container_name: c2 87 | volumes: 88 | - etc_munge:/etc/munge 89 | - etc_slurm:/etc/slurm 90 | - slurm_jobdir:/data 91 | - var_log_slurm:/var/log/slurm 92 | expose: 93 | - "6818" 94 | depends_on: 95 | - "slurmctld" 96 | networks: 97 | common-network: 98 | ipv4_address: 10.1.1.12 99 | cap_add: 100 | - NET_ADMIN 101 | 102 | volumes: 103 | etc_munge: 104 | etc_slurm: 105 | slurm_jobdir: 106 | var_lib_mysql: 107 | var_log_slurm: 108 | 109 | networks: 110 | common-network: 111 | driver: bridge 112 | ipam: 113 | driver: default 114 | config: 115 | - subnet: 10.1.1.0/24 116 | -------------------------------------------------------------------------------- /ci/slurm/register_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \ 5 | docker compose restart slurmdbd slurmctld 6 | -------------------------------------------------------------------------------- /ci/slurm/start-slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker compose up -d --no-build 4 | 5 | while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] 6 | do 7 | echo "Waiting for SLURM cluster to become ready"; 8 | sleep 2 9 | done 10 | echo "SLURM properly configured" 11 | 12 | # On some clusters the login node does not have the same interface as the 13 | # compute nodes. The next three lines allow to test this edge case by adding 14 | # separate interfaces on the worker and on the scheduler nodes. 15 | docker exec slurmctld ip addr add 10.1.1.20/24 dev eth0 label eth0:scheduler 16 | docker exec c1 ip addr add 10.1.1.21/24 dev eth0 label eth0:worker 17 | docker exec c2 ip addr add 10.1.1.22/24 dev eth0 label eth0:worker 18 | -------------------------------------------------------------------------------- /ci/ssh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | cd ./ci/ssh 5 | docker compose pull 6 | ./start-ssh.sh 7 | cd - 8 | 9 | docker exec test /bin/bash -c "cd /dpdispatcher && pip install uv && uv pip install --system .[test] coverage && coverage run --source=./dpdispatcher -m unittest -v && coverage report" 10 | docker exec --env-file <(env | grep -e GITHUB -e CODECOV) test /bin/bash -c "cd /dpdispatcher && curl -Os https://uploader.codecov.io/latest/linux/codecov && chmod +x codecov && ./codecov" 11 | -------------------------------------------------------------------------------- /ci/ssh/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.2" 2 | 3 | services: 4 | server: 5 | image: takeyamajp/ubuntu-sshd:ubuntu22.04 6 | build: . 7 | container_name: server 8 | hostname: server 9 | environment: 10 | ROOT_PASSWORD: dpdispatcher 11 | expose: 12 | - "22" 13 | volumes: 14 | - ssh_config:/root/.ssh 15 | jumphost: 16 | image: takeyamajp/ubuntu-sshd:ubuntu22.04 17 | build: . 
18 | container_name: jumphost 19 | hostname: jumphost 20 | environment: 21 | ROOT_PASSWORD: dpdispatcher 22 | expose: 23 | - "22" 24 | volumes: 25 | - ssh_config:/root/.ssh 26 | test: 27 | image: python:3.10 28 | tty: true 29 | build: . 30 | container_name: test 31 | hostname: test 32 | environment: 33 | DPDISPATCHER_TEST: ssh 34 | volumes: 35 | - ssh_config:/root/.ssh 36 | - ../..:/dpdispatcher 37 | depends_on: 38 | - server 39 | - jumphost 40 | 41 | volumes: 42 | ssh_config: 43 | -------------------------------------------------------------------------------- /ci/ssh/start-ssh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker compose up -d --no-build 4 | 5 | # Set up SSH keys on server 6 | docker exec server /bin/bash -c "ssh-keygen -b 2048 -t rsa -f /root/.ssh/id_rsa -q -N \"\" && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys" 7 | docker exec server /bin/bash -c "mkdir -p /dpdispatcher_working" 8 | docker exec server /bin/bash -c "mkdir -p /tmp/rsync_test" 9 | 10 | # Set up SSH keys on jumphost and configure it to access server 11 | docker exec jumphost /bin/bash -c "ssh-keygen -b 2048 -t rsa -f /root/.ssh/id_rsa -q -N \"\" && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys" 12 | 13 | # Copy keys between containers to enable jump host functionality 14 | # Get the public key from jumphost and add it to server's authorized_keys 15 | docker exec jumphost /bin/bash -c "cat /root/.ssh/id_rsa.pub" | docker exec -i server /bin/bash -c "cat >> /root/.ssh/authorized_keys" 16 | 17 | # Get the public key from test (which shares volume with server) and add it to jumphost authorized_keys 18 | docker exec test /bin/bash -c "cat /root/.ssh/id_rsa.pub" | docker exec -i jumphost /bin/bash -c "cat >> /root/.ssh/authorized_keys" 19 | 20 | # Configure SSH client settings for known hosts to avoid host key verification 21 | docker exec test /bin/bash -c "echo 'StrictHostKeyChecking no' >> /root/.ssh/config && echo 'UserKnownHostsFile /dev/null' >> /root/.ssh/config" 22 | docker exec jumphost /bin/bash -c "echo 'StrictHostKeyChecking no' >> /root/.ssh/config && echo 'UserKnownHostsFile /dev/null' >> /root/.ssh/config" 23 | docker exec server /bin/bash -c "echo 'StrictHostKeyChecking no' >> /root/.ssh/config && echo 'UserKnownHostsFile /dev/null' >> /root/.ssh/config" 24 | 25 | # Install rsync on all containers 26 | docker exec test /bin/bash -c "apt-get -y update && apt-get -y install rsync" 27 | docker exec jumphost /bin/bash -c "apt-get -y update && apt-get -y install rsync" 28 | docker exec server /bin/bash -c "apt-get -y update && apt-get -y install rsync" 29 | -------------------------------------------------------------------------------- /ci/ssh_rsync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | cd ./ci/ssh 5 | docker compose pull 6 | ./start-ssh.sh 7 | cd - 8 | 9 | # install rsync 10 | docker exec server /bin/bash -c "apt-get -y update && apt-get -y install rsync" 11 | docker exec test /bin/bash -c "apt-get -y update && apt-get -y install rsync" 12 | 13 | docker exec test /bin/bash -c "cd /dpdispatcher && pip install uv && uv pip install --system .[test] coverage && coverage run --source=./dpdispatcher -m unittest -v && coverage report" 14 | docker exec --env-file <(env | grep -e GITHUB -e CODECOV) test /bin/bash -c "cd /dpdispatcher && curl -Os 
https://uploader.codecov.io/latest/linux/codecov && chmod +x codecov && ./codecov" 15 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "tests" 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | threshold: 100% 8 | patch: 9 | default: 10 | threshold: 100% 11 | -------------------------------------------------------------------------------- /conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | channel_sources: 2 | - defaults 3 | - conda-forge 4 | channel_targets: 5 | - deepmodeling 6 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "dpdispatcher" %} 2 | {% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | git_url: https://github.com/deepmodeling/dpdispatcher 10 | # git_rev: {{ version }} 11 | 12 | build: 13 | number: 0 14 | noarch: python 15 | script: python -m pip install --no-deps --ignore-installed . 16 | 17 | requirements: 18 | build: 19 | - git 20 | host: 21 | - git 22 | - python >=3.6 23 | - pip 24 | - setuptools_scm 25 | - dargs 26 | - paramiko 27 | - requests 28 | - tqdm 29 | 30 | run: 31 | - python >=3.6 32 | - dargs 33 | - paramiko 34 | - requests 35 | - tqdm 36 | 37 | test: 38 | imports: 39 | - dpdispatcher 40 | 41 | about: 42 | home: https://github.com/deepmodeling/dpdispatcher 43 | license: LGPL-3.0 44 | license_family: LGPL 45 | license_file: LICENSE 46 | doc_url: https://github.com/deepmodeling/dpdispatcher 47 | dev_url: https://github.com/deepmodeling/dpdispatcher 48 | 49 | extra: 50 | recipe-maintainers: 51 | - felix5572 52 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | # sphinx build folder 2 | _build 3 | 4 | # Compiled source # 5 | ################### 6 | *.com 7 | *.class 8 | *.dll 9 | *.exe 10 | *.o 11 | *.so 12 | 13 | # Packages # 14 | ############ 15 | # it's better to unpack these files and commit the raw source 16 | # git has its own built in compression methods 17 | *.7z 18 | *.dmg 19 | *.gz 20 | *.iso 21 | *.jar 22 | *.rar 23 | *.tar 24 | *.zip 25 | 26 | # Logs and databases # 27 | ###################### 28 | *.log 29 | *.sql 30 | *.sqlite 31 | 32 | # OS generated files # 33 | ###################### 34 | .DS_Store? 35 | ehthumbs.db 36 | Icon? 37 | Thumbs.db 38 | 39 | # Editor backup files # 40 | ####################### 41 | *~ 42 | # generated automatically 43 | api/ 44 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/cli.rst: -------------------------------------------------------------------------------- 1 | .. _cli: 2 | 3 | Command line interface 4 | ====================== 5 | 6 | .. argparse:: 7 | :module: dpdispatcher.dpdisp 8 | :func: main_parser 9 | :prog: dpdisp 10 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from datetime import date 16 | 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "DPDispatcher" 23 | copyright = f"2020-{date.today().year}, Deep Modeling" 24 | author = "DeepModeling" 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | "deepmodeling_sphinx", 34 | "dargs.sphinx", 35 | "myst_parser", 36 | "sphinx_book_theme", 37 | "sphinx.ext.viewcode", 38 | "sphinx.ext.intersphinx", 39 | "numpydoc", 40 | "sphinx.ext.autosummary", 41 | "sphinxarg.ext", 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 51 | 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | 55 | # The theme to use for HTML and HTML Help pages. See the documentation for 56 | # a list of builtin themes. 57 | # 58 | html_theme = "sphinx_book_theme" 59 | 60 | # Add any paths that contain custom static files (such as style sheets) here, 61 | # relative to this directory. They are copied after the builtin static files, 62 | # so a file named "default.css" will overwrite the builtin "default.css". 
63 | html_static_path = ["_static"] 64 | html_css_files = [] 65 | 66 | autodoc_default_flags = ["members"] 67 | autosummary_generate = True 68 | master_doc = "index" 69 | 70 | 71 | def run_apidoc(_): 72 | from sphinx.ext.apidoc import main 73 | 74 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 75 | cur_dir = os.path.abspath(os.path.dirname(__file__)) 76 | module = os.path.join(cur_dir, "..", "dpdispatcher") 77 | main( 78 | [ 79 | "-M", 80 | "--tocfile", 81 | "api", 82 | "-H", 83 | "DPDispatcher API", 84 | "-o", 85 | os.path.join(cur_dir, "api"), 86 | module, 87 | "--force", 88 | ] 89 | ) 90 | 91 | 92 | def setup(app): 93 | app.connect("builder-inited", run_apidoc) 94 | 95 | 96 | intersphinx_mapping = { 97 | "python": ("https://docs.python.org/", None), 98 | "dargs": ("https://docs.deepmodeling.com/projects/dargs/en/latest/", None), 99 | } 100 | 101 | myst_enable_extensions = [ 102 | "colon_fence", 103 | ] 104 | -------------------------------------------------------------------------------- /doc/credits.rst: -------------------------------------------------------------------------------- 1 | Authors 2 | ======= 3 | 4 | .. git-shortlog-authors:: 5 | -------------------------------------------------------------------------------- /doc/env.md: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | 3 | When launching a job, dpdispatcher sets the following environment variables according to the resources, in addition to user-defined environment variables: 4 | 5 | :::{envvar} DPDISPATCHER_NUMBER_NODE 6 | 7 | The number of nodes required for each job. 8 | 9 | ::: 10 | 11 | :::{envvar} DPDISPATCHER_CPU_PER_NODE 12 | 13 | CPU numbers of each node assigned to each job. 14 | 15 | ::: 16 | 17 | :::{envvar} DPDISPATCHER_GPU_PER_NODE 18 | 19 | GPU numbers of each node assigned to each job. 20 | 21 | ::: 22 | 23 | :::{envvar} DPDISPATCHER_QUEUE_NAME 24 | 25 | The queue name of batch job scheduler system. 26 | 27 | ::: 28 | 29 | :::{envvar} DPDISPATCHER_GROUP_SIZE 30 | 31 | The number of tasks in a job. 0 means infinity. 32 | 33 | ::: 34 | 35 | These environment variables can be used in the {dargs:argument}`command `, for example, `mpirun -n ${DPDISPATCHER_CPU_PER_NODE} xx.run`. 36 | -------------------------------------------------------------------------------- /doc/examples/expanse.md: -------------------------------------------------------------------------------- 1 | # Running the DeePMD-kit on the Expanse cluster 2 | 3 | [Expanse](https://www.sdsc.edu/support/user_guides/expanse.html) is a cluster operated by the San Diego Supercomputer Center. Here we provide an example to run jobs on the expanse. 4 | 5 | The machine parameters are provided below. Expanse uses the SLURM workload manager for job scheduling. {ref}`remote_root ` has been created in advance. It's worth metioned that we do not recommend to use the password, so [SSH keys](https://www.ssh.com/academy/ssh/key) are used instead to improve security. 6 | 7 | ```{literalinclude} ../../examples/machine/expanse.json 8 | :language: json 9 | :linenos: 10 | ``` 11 | 12 | Expanse's standard compute nodes are each powered by two 64-core AMD EPYC 7742 processors and contain 256 GB of DDR4 memory. Here, we request one node with 32 cores and 16 GB memory from the `shared` partition. Expanse does not support `--gres=gpu:0` command, so we use {ref}`custom_gpu_line ` to customize the statement. 
13 | 14 | ```{literalinclude} ../../examples/resources/expanse_cpu.json 15 | :language: json 16 | :linenos: 17 | ``` 18 | 19 | The following task parameter runs a DeePMD-kit task, forwarding an input file and backwarding graph files. Here, the data set will be used among all the tasks, so it is not included in the {ref}`forward_files `. Instead, it should be included in the submission's {ref}`forward_common_files `. 20 | 21 | ```{literalinclude} ../../examples/task/deepmd-kit.json 22 | :language: json 23 | :linenos: 24 | ``` 25 | -------------------------------------------------------------------------------- /doc/examples/g16.md: -------------------------------------------------------------------------------- 1 | # Running Gaussian 16 with failure allowed 2 | 3 | Typically, a task will retry three times if the exit code is not zero. Sometimes, one may want to allow a non-zero exit code. For example, when running a large number of Gaussian 16 single-point calculation tasks, some of the Gaussian 16 tasks may throw SCF errors and return a non-zero code. One can append `||:` to the command: 4 | 5 | ```{literalinclude} ../../examples/task/g16.json 6 | :language: json 7 | :linenos: 8 | ``` 9 | 10 | This ensures the task always returns a zero exit code. 11 | -------------------------------------------------------------------------------- /doc/examples/shell.md: -------------------------------------------------------------------------------- 1 | # Running multiple MD tasks on a GPU workstation 2 | 3 | In this example, we are going to show how to run multiple MD tasks on a GPU workstation. This workstation does not have any job scheduling software installed, so we will use `Shell` as {ref}`batch_type `. 4 | 5 | ```{literalinclude} ../../examples/machine/mandu.json 6 | :language: json 7 | :linenos: 8 | ``` 9 | 10 | The workstation has 48 CPU cores and 8 RTX 3090 cards. Here we hope each card runs 6 tasks at the same time, as each task does not consume too many GPU resources. Thus, {ref}`strategy/if_cuda_multi_devices ` is set to `true` and {ref}`para_deg ` is set to 6. 11 | 12 | ```{literalinclude} ../../examples/resources/mandu.json 13 | :language: json 14 | :linenos: 15 | ``` 16 | 17 | Note that {ref}`group_size ` should be set to `0` (which means infinity) to ensure there is only one job and avoid running multiple jobs at the same time. 18 | -------------------------------------------------------------------------------- /doc/examples/template.md: -------------------------------------------------------------------------------- 1 | # Customizing the submission script header 2 | 3 | When submitting jobs to some clusters, such as the [Tiger Cluster](https://researchcomputing.princeton.edu/systems/tiger) at Princeton University, the Slurm header is quite different from the standard one. In this case, DPDispatcher allows users to customize the templates by setting {dargs:argument}`strategy/customized_script_header_template_file ` to a template file: 4 | 5 | ```{literalinclude} ../../examples/resources/tiger.json 6 | :language: json 7 | :linenos: 8 | ``` 9 | 10 | `template.slurm` is the template file, where {meth}`str.format` is used to format the template with [Resources Parameters](resources): 11 | 12 | ```{literalinclude} ../../examples/resources/template.slurm 13 | :linenos: 14 | ``` 15 | 16 | See [Python Format String Syntax](https://docs.python.org/3/library/string.html#formatstrings) for how to insert parameters inside the template.
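As a minimal sketch of the formatting step described above: the header template is rendered with `str.format`, so each placeholder must match a Resources parameter name. The template text and values below are illustrative assumptions for demonstration only, not the contents of `examples/resources/template.slurm`.

```python
# Minimal, self-contained sketch of how a customized script header template is
# rendered with str.format (illustrative only; the real template.slurm differs).
template = """#!/bin/bash
#SBATCH --nodes={number_node}
#SBATCH --ntasks-per-node={cpu_per_node}
#SBATCH --partition={queue_name}
"""

# Placeholder names correspond to Resources parameters such as number_node,
# cpu_per_node, and queue_name; the values here are made-up examples.
header = template.format(number_node=1, cpu_per_node=32, queue_name="cpu")
print(header)
```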
17 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. deepmd-kit documentation master file, created by 2 | sphinx-quickstart on Sat Nov 21 18:36:24 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | DPDispatcher's documentation 7 | ====================================== 8 | 9 | DPDispatcher is a Python package used to generate job input scripts for HPC (High Performance Computing) scheduler systems (Slurm/PBS/LSF/JH_UniScheduler/dpcloudserver), submit these scripts to HPC systems, and poke them until they finish. 10 | 11 | DPDispatcher will monitor (poke) until these jobs finish and download the result files (if these jobs are running on remote systems connected by SSH). 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: Contents: 16 | 17 | 18 | install 19 | getting-started 20 | context 21 | batch 22 | machine 23 | resources 24 | task 25 | env 26 | run 27 | cli 28 | api/api 29 | 30 | .. toctree:: 31 | :caption: Examples 32 | :glob: 33 | 34 | examples/expanse 35 | examples/g16 36 | examples/shell 37 | examples/template 38 | 39 | .. toctree:: 40 | :caption: Project details 41 | :glob: 42 | 43 | credits 44 | 45 | Indices and tables 46 | ================== 47 | 48 | * :ref:`genindex` 49 | * :ref:`modindex` 50 | * :ref:`search` 51 | -------------------------------------------------------------------------------- /doc/install.md: -------------------------------------------------------------------------------- 1 | # Install DPDispatcher 2 | 3 | DPDispatcher can be installed by `pip`: 4 | 5 | ```bash 6 | pip install dpdispatcher 7 | ``` 8 | 9 | To add [Bohrium](https://bohrium.dp.tech/) support, execute 10 | 11 | ```bash 12 | pip install dpdispatcher[bohrium] 13 | ``` 14 | -------------------------------------------------------------------------------- /doc/machine.rst: -------------------------------------------------------------------------------- 1 | Machine parameters 2 | ====================================== 3 | .. note:: 4 | One can load, modify, and export the input file by using our effective web-based tool `DP-GUI `_ online or hosted using the :ref:`command line interface ` :code:`dpdisp gui`. All parameters below can be set in DP-GUI. By clicking "SAVE JSON", one can download the input file. 5 | 6 | .. dargs:: 7 | :module: dpdispatcher.arginfo 8 | :func: machine_dargs 9 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/pep723.rst: -------------------------------------------------------------------------------- 1 | .. dargs:: 2 | :module: dpdispatcher.run 3 | :func: pep723_args 4 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | .[docs] 2 | -------------------------------------------------------------------------------- /doc/resources.rst: -------------------------------------------------------------------------------- 1 | Resources parameters 2 | ====================================== 3 | .. note:: 4 | One can load, modify, and export the input file by using our effective web-based tool `DP-GUI `_ online or hosted using the :ref:`command line interface ` :code:`dpdisp gui`. All parameters below can be set in DP-GUI. By clicking "SAVE JSON", one can download the input file for. 5 | 6 | .. dargs:: 7 | :module: dpdispatcher.arginfo 8 | :func: resources_dargs 9 | -------------------------------------------------------------------------------- /doc/run.md: -------------------------------------------------------------------------------- 1 | # Run Python scripts 2 | 3 | DPDispatcher can be used to directly run a single Python script: 4 | 5 | ```sh 6 | dpdisp run script.py 7 | ``` 8 | 9 | The script must include [inline script metadata](https://packaging.python.org/en/latest/specifications/inline-script-metadata/) compliant with [PEP 723](https://peps.python.org/pep-0723/). 10 | An example of the script is shown below. 11 | 12 | ```{literalinclude} ../examples/dpdisp_run.py 13 | :language: py 14 | :linenos: 15 | ``` 16 | 17 | The PEP 723 metadata entries for `tool.dpdispatcher` are defined as follows: 18 | 19 | ```{eval-rst} 20 | .. include:: pep723.rst 21 | ``` 22 | -------------------------------------------------------------------------------- /doc/task.rst: -------------------------------------------------------------------------------- 1 | Task parameters 2 | ====================================== 3 | .. note:: 4 | One can load, modify, and export the input file by using our effective web-based tool `DP-GUI `_ online or hosted using the :ref:`command line interface ` :code:`dpdisp gui`. All parameters below can be set in DP-GUI. By clicking "SAVE JSON", one can download the input file. 5 | 6 | .. 
dargs:: 7 | :module: dpdispatcher.arginfo 8 | :func: task_dargs 9 | -------------------------------------------------------------------------------- /dpdispatcher/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "DeepModeling Team" 2 | __copyright__ = "Copyright 2019-2023, DeepModeling" 3 | __status__ = "Production" 4 | try: 5 | from ._version import version as __version__ 6 | except ImportError: 7 | __version__ = "unknown" 8 | 9 | import dpdispatcher.contexts # noqa: F401 10 | import dpdispatcher.machines # noqa: F401 11 | from dpdispatcher.machine import Machine 12 | from dpdispatcher.submission import Job, Resources, Submission, Task 13 | 14 | __all__ = [ 15 | "__version__", 16 | "Machine", 17 | "Submission", 18 | "Task", 19 | "Job", 20 | "Resources", 21 | ] 22 | -------------------------------------------------------------------------------- /dpdispatcher/__main__.py: -------------------------------------------------------------------------------- 1 | """Package dp entry point.""" 2 | 3 | from dpdispatcher.dpdisp import ( 4 | main, 5 | ) 6 | 7 | if __name__ == "__main__": 8 | main() 9 | -------------------------------------------------------------------------------- /dpdispatcher/arginfo.py: -------------------------------------------------------------------------------- 1 | from dpdispatcher.machine import Machine 2 | from dpdispatcher.submission import Resources, Task 3 | 4 | resources_dargs = Resources.arginfo 5 | machine_dargs = Machine.arginfo 6 | task_dargs = Task.arginfo 7 | -------------------------------------------------------------------------------- /dpdispatcher/contexts/__init__.py: -------------------------------------------------------------------------------- 1 | """Contexts.""" 2 | 3 | import importlib 4 | from pathlib import Path 5 | 6 | PACKAGE_BASE = "dpdispatcher.contexts" 7 | NOT_LOADABLE = ("__init__.py",) 8 | 9 | for module_file in Path(__file__).parent.glob("*.py"): 10 | if module_file.name not in NOT_LOADABLE: 11 | module_name = f".{module_file.stem}" 12 | importlib.import_module(module_name, PACKAGE_BASE) 13 | -------------------------------------------------------------------------------- /dpdispatcher/dlog.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import warnings 5 | 6 | dlog = logging.getLogger("dpdispatcher") 7 | dlog.propagate = False 8 | dlog.setLevel(logging.INFO) 9 | cwd_logfile_path = os.path.join(os.getcwd(), "dpdispatcher.log") 10 | dlogf = logging.FileHandler(cwd_logfile_path, delay=True) 11 | try: 12 | dlog.addHandler(dlogf) 13 | dlog.info(f"LOG INIT:dpdispatcher log direct to {cwd_logfile_path}") 14 | except PermissionError: 15 | dlog.removeHandler(dlogf) 16 | warnings.warn( 17 | f"dump logfile dpdispatcher.log to {cwd_logfile_path} meet permission error. 
redirect the log to ~/dpdispatcher.log" 18 | ) 19 | dlogf = logging.FileHandler( 20 | os.path.join(os.path.expanduser("~"), "dpdispatcher.log"), delay=True 21 | ) 22 | dlog.addHandler(dlogf) 23 | dlog.info("LOG INIT:dpdispatcher log init at ~/dpdispatcher.log") 24 | 25 | dlogf_formatter = logging.Formatter("%(asctime)s - %(levelname)s : %(message)s") 26 | dlogf.setFormatter(dlogf_formatter) 27 | # dlog.addHandler(dlogf) 28 | 29 | dlog_stdout = logging.StreamHandler(sys.stdout) 30 | dlog_stdout.setFormatter(dlogf_formatter) 31 | dlog.addHandler(dlog_stdout) 32 | 33 | __all__ = [ 34 | "dlog", 35 | ] 36 | -------------------------------------------------------------------------------- /dpdispatcher/dpcloudserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/dpdispatcher/dpcloudserver/__init__.py -------------------------------------------------------------------------------- /dpdispatcher/dpcloudserver/client.py: -------------------------------------------------------------------------------- 1 | """Provide backward compatibility with dflow.""" 2 | 3 | from dpdispatcher.utils.dpcloudserver.client import RequestInfoException 4 | 5 | __all__ = [ 6 | "RequestInfoException", 7 | ] 8 | -------------------------------------------------------------------------------- /dpdispatcher/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | """Entry points.""" 2 | -------------------------------------------------------------------------------- /dpdispatcher/entrypoints/gui.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: LGPL-3.0-or-later 2 | """DP-GUI entrypoint.""" 3 | 4 | 5 | def start_dpgui(*, port: int, bind_all: bool, **kwargs): 6 | """Host DP-GUI server. 7 | 8 | Parameters 9 | ---------- 10 | port : int 11 | The port to serve DP-GUI on. 12 | bind_all : bool 13 | Serve on all public interfaces. This will expose your DP-GUI instance 14 | to the network on both IPv4 and IPv6 (where available).
15 | **kwargs 16 | additional arguments 17 | 18 | Raises 19 | ------ 20 | ModuleNotFoundError 21 | The dpgui package is not installed 22 | """ 23 | try: 24 | from dpgui import ( 25 | start_dpgui, 26 | ) 27 | except ModuleNotFoundError as e: 28 | raise ModuleNotFoundError( 29 | "To use DP-GUI, please install the dpgui package:\npip install dpgui" 30 | ) from e 31 | start_dpgui(port=port, bind_all=bind_all) 32 | -------------------------------------------------------------------------------- /dpdispatcher/entrypoints/run.py: -------------------------------------------------------------------------------- 1 | """Run PEP 723 script.""" 2 | 3 | from dpdispatcher.run import run_pep723 4 | 5 | 6 | def run(*, filename: str): 7 | with open(filename) as f: 8 | script = f.read() 9 | run_pep723(script) 10 | -------------------------------------------------------------------------------- /dpdispatcher/entrypoints/submission.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from dpdispatcher.dlog import dlog 4 | from dpdispatcher.submission import Submission 5 | from dpdispatcher.utils.job_status import JobStatus 6 | from dpdispatcher.utils.record import record 7 | 8 | 9 | def handle_submission( 10 | *, 11 | submission_hash: str, 12 | download_terminated_log: bool = False, 13 | download_finished_task: bool = False, 14 | clean: bool = False, 15 | reset_fail_count: bool = False, 16 | ): 17 | """Handle terminated submission. 18 | 19 | Parameters 20 | ---------- 21 | submission_hash : str 22 | Submission hash to download. 23 | download_terminated_log : bool, optional 24 | Download log files of terminated tasks. 25 | download_finished_task : bool, optional 26 | Download finished tasks. 27 | clean : bool, optional 28 | Clean submission. 29 | reset_fail_count : bool, optional 30 | Reset fail count of all jobs to zero. 31 | 32 | Raises 33 | ------ 34 | ValueError 35 | At least one action should be specified. 
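    Examples
    --------
    A minimal sketch of calling this entry point directly from Python; the
    submission hash below is a hypothetical placeholder, not a real record.

    >>> handle_submission(
    ...     submission_hash="0123456789abcdef",
    ...     download_terminated_log=True,
    ... )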
36 | """ 37 | if ( 38 | int(download_terminated_log) 39 | + int(download_finished_task) 40 | + int(clean) 41 | + int(reset_fail_count) 42 | == 0 43 | ): 44 | raise ValueError("At least one action should be specified.") 45 | 46 | submission_file = record.get_submission(submission_hash) 47 | submission = Submission.submission_from_json(str(submission_file)) 48 | submission.belonging_tasks = [ 49 | task for job in submission.belonging_jobs for task in job.job_task_list 50 | ] 51 | # TODO: for unclear reason, the submission_hash may be changed 52 | submission.submission_hash = submission_hash 53 | submission.machine.context.bind_submission(submission) 54 | if reset_fail_count: 55 | for job in submission.belonging_jobs: 56 | job.fail_count = 0 57 | # save to remote and local 58 | submission.submission_to_json() 59 | record.write(submission) 60 | if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0: 61 | # if only reset_fail_count, no need to update submission state (expensive) 62 | return 63 | submission.update_submission_state() 64 | submission.submission_to_json() 65 | record.write(submission) 66 | 67 | terminated_tasks = [] 68 | finished_tasks = [] 69 | for task in submission.belonging_tasks: 70 | task.get_task_state(submission.machine.context) 71 | if task.task_state == JobStatus.terminated: 72 | terminated_tasks.append(task) 73 | elif task.task_state == JobStatus.finished: 74 | finished_tasks.append(task) 75 | submission.belonging_tasks = [] 76 | 77 | if download_terminated_log: 78 | for task in terminated_tasks: 79 | task.backward_files = [task.outlog, task.errlog] 80 | submission.belonging_tasks += terminated_tasks 81 | if download_finished_task: 82 | submission.belonging_tasks += finished_tasks 83 | 84 | submission.download_jobs() 85 | 86 | if download_terminated_log: 87 | terminated_log_files = [] 88 | for task in terminated_tasks: 89 | assert submission.local_root is not None 90 | terminated_log_files.append( 91 | Path(submission.local_root) / task.task_work_path / task.outlog 92 | ) 93 | terminated_log_files.append( 94 | Path(submission.local_root) / task.task_work_path / task.errlog 95 | ) 96 | 97 | dlog.info( 98 | "Terminated logs are downloaded into:\n " 99 | + "\n ".join([str(f) for f in terminated_log_files]) 100 | ) 101 | 102 | if clean: 103 | submission.clean_jobs() 104 | -------------------------------------------------------------------------------- /dpdispatcher/machines/__init__.py: -------------------------------------------------------------------------------- 1 | """Machines.""" 2 | 3 | import importlib 4 | from pathlib import Path 5 | 6 | PACKAGE_BASE = "dpdispatcher.machines" 7 | NOT_LOADABLE = ("__init__.py",) 8 | 9 | for module_file in Path(__file__).parent.glob("*.py"): 10 | if module_file.name not in NOT_LOADABLE: 11 | module_name = f".{module_file.stem}" 12 | importlib.import_module(module_name, PACKAGE_BASE) 13 | -------------------------------------------------------------------------------- /dpdispatcher/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utils.""" 2 | -------------------------------------------------------------------------------- /dpdispatcher/utils/dpcloudserver/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | 3 | __all__ = ["Client"] 4 | -------------------------------------------------------------------------------- /dpdispatcher/utils/dpcloudserver/config.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | HTTP_TIME_OUT = 30 4 | 5 | API_HOST = os.environ.get("DPDISPATCHER_LEBESGUE_API_HOST", "https://bohrium.dp.tech/") 6 | API_LOGGER_STACK_INFO = os.environ.get("API_LOGGER_STACK_INFO", "") 7 | ALI_STS_ENDPOINT = os.environ.get( 8 | "DPDISPATCHER_LEBESGUE_ALI_STS_ENDPOINT", "http://oss-cn-shenzhen.aliyuncs.com" 9 | ) 10 | ALI_STS_BUCKET_NAME = os.environ.get( 11 | "DPDISPATCHER_LEBESGUE_ALI_STS_BUCKET_NAME", "dpcloudserver" 12 | ) 13 | ALI_OSS_BUCKET_URL = os.environ.get( 14 | "DPDISPATCHER_LEBESGUE_ALI_OSS_BUCKET_URL", 15 | "https://dpcloudserver.oss-cn-shenzhen.aliyuncs.com/", 16 | ) 17 | -------------------------------------------------------------------------------- /dpdispatcher/utils/dpcloudserver/retcode.py: -------------------------------------------------------------------------------- 1 | # 2开头的错误代码第二位代表错误等级 2 | # 0. 严重错误; 1. 普通错误; 2. 规则错误; 3. 一般信息; 4. 未知错误 3 | class RETCODE: 4 | OK = "0000" # 正常 5 | DBERR = "2000" # 数据库异常 6 | THIRDERR = "2001" # 第三方异常 7 | DATAERR = "2002" # 数据异常 8 | IOERR = "2003" # IO异常 9 | 10 | TOKENINVALID = "2100" # 登陆错误 11 | PARAMERR = "2101" # 参数错误 12 | USERERR = "2102" # 用户异常 13 | ROLEERR = "2103" # 权限错误 14 | PWDERR = "2104" # 密码错误 15 | VERIFYERR = "2105" # 验证错误 16 | 17 | REQERR = "2200" # 请求错误 18 | 19 | NODATA = "2300" # 无数据 20 | UNDERDEBUG = "2301" # debug模式下无法使用 21 | 22 | UNKOWNERR = "2400" # 未知错误 23 | -------------------------------------------------------------------------------- /dpdispatcher/utils/dpcloudserver/zip_file.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from zipfile import ZipFile 4 | 5 | # def zip_file_list(root_path, zip_filename, file_list=[]): 6 | # shutil.make_archive(base_name=zip_filename, 7 | # root_dir=root_path,) 8 | 9 | 10 | def zip_file_list(root_path, zip_filename, file_list=[]): 11 | out_zip_file = os.path.join(root_path, zip_filename) 12 | # print('debug: file_list', file_list) 13 | zip_obj = ZipFile(out_zip_file, "w") 14 | for f in file_list: 15 | matched_files = os.path.join(root_path, f) 16 | for ii in glob.glob(matched_files): 17 | # print('debug: matched_files:ii', ii) 18 | if os.path.isdir(ii): 19 | arcname = os.path.relpath(ii, start=root_path) 20 | zip_obj.write(ii, arcname) 21 | for root, dirs, files in os.walk(ii): 22 | for file in files: 23 | filename = os.path.join(root, file) 24 | arcname = os.path.relpath(filename, start=root_path) 25 | # print('debug: filename:arcname:root_path', filename, arcname, root_path) 26 | zip_obj.write(filename, arcname) 27 | else: 28 | arcname = os.path.relpath(ii, start=root_path) 29 | zip_obj.write(ii, arcname) 30 | zip_obj.close() 31 | return out_zip_file 32 | 33 | 34 | # def zip_files(root_path, out_file, selected=[]): 35 | # obj = ZipFile(out_file, "w") 36 | # # change /xxx/ to /xxx or xxx to /xxx and pop '' 37 | # for i in range(len(selected)): 38 | # if not selected[i]: 39 | # selected.pop(i) 40 | # continue 41 | 42 | # selected[i] = selected[i].strip() 43 | # if selected[i].endswith('/'): 44 | # selected[i] = selected[i][:-1] 45 | # if not selected[i].startswith('/'): 46 | # selected[i] = '/{}'.format(selected[i]) 47 | 48 | # for root, dirs, files in os.walk(root_path): 49 | # for item in files: 50 | # filename = os.path.join(root, item) 51 | # arcname = filename.replace(root_path,'') 52 | # if not is_selected(arcname, selected): 53 | # continue 54 | 55 | # obj.write(filename, arcname) 56 | # if not 
obj.filelist: 57 | # return 58 | 59 | # obj.close() 60 | 61 | 62 | # def is_selected(arcname, selected): 63 | # if not selected: 64 | # return True 65 | 66 | # arcdir = os.path.dirname(arcname) 67 | # for s in selected: 68 | # if arcname == s: 69 | # return True 70 | 71 | # if arcdir == s: 72 | # return True 73 | 74 | # if arcname.startswith(s + '/'): 75 | # return True 76 | 77 | # return False 78 | 79 | 80 | def unzip_file(zip_file, out_dir="./"): 81 | obj = ZipFile(zip_file, "r") 82 | for item in obj.namelist(): 83 | obj.extract(item, out_dir) 84 | -------------------------------------------------------------------------------- /dpdispatcher/utils/job_status.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | 4 | class JobStatus(IntEnum): 5 | unsubmitted = 1 6 | waiting = 2 7 | running = 3 8 | terminated = 4 9 | finished = 5 10 | completing = 6 11 | unknown = 100 12 | 13 | 14 | # def __str__(self): 15 | # return repr(self) 16 | -------------------------------------------------------------------------------- /dpdispatcher/utils/record.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import List 4 | 5 | 6 | class Record: 7 | """Record failed or canceled submissions.""" 8 | 9 | def __init__(self) -> None: 10 | self.record_directory = Path.home() / ".dpdispatcher" / "submission" 11 | self.record_directory.mkdir(parents=True, exist_ok=True) 12 | 13 | def get_submissions(self) -> List[str]: 14 | """Get all stored submission hashes. 15 | 16 | Returns 17 | ------- 18 | list[str] 19 | List of submission hashes. 20 | """ 21 | return [ 22 | f.stem 23 | for f in self.record_directory.iterdir() 24 | if (f.is_file() and f.suffix == ".json") 25 | ] 26 | 27 | def write(self, submission) -> Path: 28 | """Write submission data to file. 29 | 30 | Parameters 31 | ---------- 32 | submission : dpdispatcher.Submission 33 | Submission data. 34 | 35 | Returns 36 | ------- 37 | pathlib.Path 38 | Path to submission data. 39 | """ 40 | submission_path = self.record_directory / f"{submission.submission_hash}.json" 41 | submission_path.write_text(json.dumps(submission.serialize(), indent=2)) 42 | return submission_path 43 | 44 | def get_submission(self, hash: str, not_exist_ok: bool = False) -> Path: 45 | """Get submission data by hash. 46 | 47 | Parameters 48 | ---------- 49 | hash : str 50 | Hash of submission data. 51 | 52 | Returns 53 | ------- 54 | pathlib.Path 55 | Path to submission data. 56 | """ 57 | submission_file = self.record_directory / f"{hash}.json" 58 | if not not_exist_ok and not submission_file.is_file(): 59 | raise FileNotFoundError(f"Submission file not found: {submission_file}") 60 | return submission_file 61 | 62 | def remove(self, hash: str): 63 | """Remove submission data by hash. 64 | 65 | Call this method when the remote directory is cleaned. 66 | 67 | Parameters 68 | ---------- 69 | hash : str 70 | Hash of submission data. 
71 | """ 72 | path = self.get_submission(hash, not_exist_ok=True) 73 | if path.is_file(): 74 | path.unlink() 75 | 76 | 77 | # the record object can be globally used 78 | record = Record() 79 | __all__ = ["record"] 80 | -------------------------------------------------------------------------------- /examples/dpdisp_run.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # # dpdispatcher doesn't use `requires-python` and `dependencies` 3 | # requires-python = ">=3" 4 | # dependencies = [ 5 | # ] 6 | # [tool.dpdispatcher] 7 | # work_base = "./" 8 | # forward_common_files=[] 9 | # backward_common_files=[] 10 | # [tool.dpdispatcher.machine] 11 | # batch_type = "Shell" 12 | # local_root = "./" 13 | # context_type = "LazyLocalContext" 14 | # [tool.dpdispatcher.resources] 15 | # number_node = 1 16 | # cpu_per_node = 1 17 | # gpu_per_node = 0 18 | # group_size = 0 19 | # [[tool.dpdispatcher.task_list]] 20 | # # no need to contain the script filename 21 | # command = "python" 22 | # # can be a glob pattern 23 | # task_work_path = "./" 24 | # forward_files = [] 25 | # backward_files = ["log"] 26 | # /// 27 | 28 | print("hello world!") 29 | -------------------------------------------------------------------------------- /examples/machine/expanse.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_type": "Slurm", 3 | "local_root": "./", 4 | "remote_root": "/expanse/lustre/scratch/njzjz/temp_project/dpgen_workdir", 5 | "clean_asynchronously": true, 6 | "context_type": "SSHContext", 7 | "remote_profile": { 8 | "hostname": "login.expanse.sdsc.edu", 9 | "username": "njzjz", 10 | "port": 22 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /examples/machine/lazy_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_type": "Shell", 3 | "local_root": "./", 4 | "context_type": "LazyLocalContext" 5 | } 6 | -------------------------------------------------------------------------------- /examples/machine/mandu.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_type": "Shell", 3 | "local_root": "./", 4 | "remote_root": "/data2/jinzhe/dpgen_workdir", 5 | "clean_asynchronously": true, 6 | "context_type": "SSHContext", 7 | "remote_profile": { 8 | "hostname": "mandu.iqb.rutgers.edu", 9 | "username": "jz748", 10 | "port": 22 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /examples/machine/ssh_proxy_command.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_type": "Shell", 3 | "context_type": "SSHContext", 4 | "local_root": "./", 5 | "remote_root": "/home/user/work", 6 | "remote_profile": { 7 | "hostname": "internal-server.company.com", 8 | "username": "user", 9 | "port": 22, 10 | "key_filename": "~/.ssh/id_rsa", 11 | "proxy_command": "ssh -W internal-server.company.com:22 -i ~/.ssh/jump_key jumpuser@bastion.company.com" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /examples/resources/expanse_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "number_node": 1, 3 | "cpu_per_node": 1, 4 | "gpu_per_node": 0, 5 | "queue_name": "shared", 6 | "group_size": 1, 7 | "custom_flags": [ 8 | "#SBATCH -c 32", 9 | "#SBATCH --mem=16G", 10 | "#SBATCH --time=48:00:00", 
11 | "#SBATCH --account=rut149", 12 | "#SBATCH --requeue" 13 | ], 14 | "source_list": [ 15 | "activate /home/njzjz/deepmd-kit" 16 | ], 17 | "envs": { 18 | "OMP_NUM_THREADS": 4, 19 | "TF_INTRA_OP_PARALLELISM_THREADS": 4, 20 | "TF_INTER_OP_PARALLELISM_THREADS": 8, 21 | "DP_AUTO_PARALLELIZATION": 1 22 | }, 23 | "batch_type": "Slurm", 24 | "kwargs": { 25 | "custom_gpu_line": "#SBATCH --gpus=0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /examples/resources/mandu.json: -------------------------------------------------------------------------------- 1 | { 2 | "number_node": 1, 3 | "cpu_per_node": 48, 4 | "gpu_per_node": 8, 5 | "queue_name": "shell", 6 | "group_size": 0, 7 | "strategy": { 8 | "if_cuda_multi_devices": true 9 | }, 10 | "source_list": [ 11 | "activate /home/jz748/deepmd-kit" 12 | ], 13 | "envs": { 14 | "OMP_NUM_THREADS": 1, 15 | "TF_INTRA_OP_PARALLELISM_THREADS": 1, 16 | "TF_INTER_OP_PARALLELISM_THREADS": 1 17 | }, 18 | "para_deg": 6 19 | } 20 | -------------------------------------------------------------------------------- /examples/resources/template.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --parsable 3 | #SBATCH --nodes={number_node} 4 | #SBATCH --ntasks-per-node={cpu_per_node} 5 | #SBATCH --qos={kwargs[qos]} 6 | #SBATCH --time=01:02:00 7 | #SBATCH --mem-per-cpu=4G 8 | -------------------------------------------------------------------------------- /examples/resources/tiger.json: -------------------------------------------------------------------------------- 1 | { 2 | "number_node": 1, 3 | "cpu_per_node": 32, 4 | "kwargs":{ 5 | "qos": "tiger-vshort" 6 | }, 7 | "source_list": ["activate abacus_env"], 8 | "strategy": { 9 | "customized_script_header_template_file": "./template.slurm" 10 | }, 11 | "group_size": 2000 12 | } 13 | -------------------------------------------------------------------------------- /examples/task/deepmd-kit.json: -------------------------------------------------------------------------------- 1 | { 2 | "command": "dp train input.json && dp freeze && dp compress", 3 | "task_work_path": "model1/", 4 | "forward_files": [ 5 | "input.json" 6 | ], 7 | "backward_files": [ 8 | "frozen_model.pb", 9 | "frozen_model_compressed.pb" 10 | ], 11 | "outlog": "log", 12 | "errlog": "err" 13 | } 14 | -------------------------------------------------------------------------------- /examples/task/g16.json: -------------------------------------------------------------------------------- 1 | { 2 | "command": "g16 < input > output ||:", 3 | "task_work_path": "p1/", 4 | "forward_files": [ 5 | "input" 6 | ], 7 | "backward_files": [ 8 | "output" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61", "setuptools_scm[toml]>=7"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "dpdispatcher" 7 | dynamic = ["version"] 8 | description = "Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish" 9 | authors = [ 10 | { name = "DeepModeling" }, 11 | ] 12 | license = { file = "LICENSE" } 13 | classifiers = [ 14 | "Programming Language :: Python :: 3.7", 15 | "Programming Language :: Python :: 3.8", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 
3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Operating System :: POSIX :: Linux", 20 | "Operating System :: MacOS :: MacOS X", 21 | "Operating System :: Microsoft :: Windows", 22 | "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", 23 | ] 24 | dependencies = [ 25 | 'paramiko', 26 | 'dargs>=0.4.1', 27 | 'requests', 28 | 'tqdm>=4.9.0', 29 | 'typing_extensions; python_version < "3.7"', 30 | 'pyyaml', 31 | 'tomli >= 1.1.0; python_version < "3.11"', 32 | ] 33 | requires-python = ">=3.7" 34 | readme = "README.md" 35 | keywords = ["dispatcher", "hpc", "slurm", "lsf", "pbs", "ssh", "jh_unischeduler"] 36 | 37 | [project.urls] 38 | Homepage = "https://github.com/deepmodeling/dpdispatcher" 39 | documentation = "https://docs.deepmodeling.com/projects/dpdispatcher" 40 | repository = "https://github.com/deepmodeling/dpdispatcher" 41 | 42 | [project.scripts] 43 | dpdisp = "dpdispatcher.dpdisp:main" 44 | 45 | [project.entry-points."dpgui"] 46 | "DPDispatcher Machine" = "dpdispatcher.arginfo:machine_dargs" 47 | "DPDispatcher Resources" = "dpdispatcher.arginfo:resources_dargs" 48 | "DPDispatcher Task" = "dpdispatcher.arginfo:task_dargs" 49 | 50 | [project.optional-dependencies] 51 | docs = [ 52 | 'sphinx', 53 | 'myst-parser', 54 | 'sphinx-book-theme', 55 | 'numpydoc', 56 | 'deepmodeling-sphinx>=0.3.0', 57 | 'dargs>=0.3.1', 58 | 'sphinx-argparse<0.5.0', 59 | ] 60 | cloudserver = ["oss2", "tqdm", "bohrium-sdk"] 61 | bohrium = ["oss2", "tqdm", "bohrium-sdk"] 62 | gui = [ 63 | "dpgui", 64 | ] 65 | test = [ 66 | "dpgui", 67 | ] 68 | 69 | [tool.setuptools.packages.find] 70 | include = ["dpdispatcher*"] 71 | 72 | [tool.setuptools_scm] 73 | write_to = "dpdispatcher/_version.py" 74 | 75 | [tool.pyright] 76 | include = ['dpdispatcher'] 77 | exclude = [ 78 | 'dpdispatcher/dpcloudserver/temp_test.py', 79 | 'dpdispatcher/_version.py', 80 | ] 81 | 82 | [tool.isort] 83 | profile = "black" 84 | 85 | [tool.ruff.lint] 86 | select = [ 87 | "E", # errors 88 | "F", # pyflakes 89 | "D", # pydocstyle 90 | "UP", # pyupgrade 91 | "I", # isort 92 | ] 93 | ignore = [ 94 | "E501", # line too long 95 | "F841", # local variable is assigned to but never used 96 | "E741", # ambiguous variable name 97 | "E402", # module level import not at top of file 98 | "D413", # missing blank line after last section 99 | "D416", # section name should end with a colon 100 | "D203", # 1 blank line required before class docstring 101 | "D107", # missing docstring in __init__ 102 | "D213", # multi-line docstring summary should start at the second line 103 | "D100", # TODO: missing docstring in public module 104 | "D101", # TODO: missing docstring in public class 105 | "D102", # TODO: missing docstring in public method 106 | "D103", # TODO: missing docstring in public function 107 | "D104", # TODO: missing docstring in public package 108 | "D105", # TODO: missing docstring in magic method 109 | "D205", # 1 blank line required between summary line and description 110 | "D401", # TODO: first line should be in imperative mood 111 | "D404", # TODO: first word of the docstring should not be This 112 | ] 113 | 114 | [tool.ruff.lint.pydocstyle] 115 | convention = "numpy" 116 | -------------------------------------------------------------------------------- /scripts/script_gen_dargs_docs.py: -------------------------------------------------------------------------------- 1 | # %% 2 | # import sys, os 3 | # sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..' 
))) 4 | # import dpdispatcher 5 | from dpdispatcher.machine import Machine 6 | from dpdispatcher.submission import Resources, Task 7 | 8 | # %% 9 | resources_dargs_doc = Resources.arginfo().gen_doc() 10 | with open("../doc/resources-auto.rst", "w") as f: 11 | # print(resources_dargs_doc) 12 | f.write(resources_dargs_doc) 13 | 14 | machine_dargs_doc = Machine.arginfo().gen_doc() 15 | with open("../doc/machine-auto.rst", "w") as f: 16 | f.write(machine_dargs_doc) 17 | 18 | task_dargs_doc = Task.arginfo().gen_doc() 19 | with open("../doc/task-auto.rst", "w") as f: 20 | f.write(task_dargs_doc) 21 | 22 | 23 | # %% 24 | -------------------------------------------------------------------------------- /scripts/script_gen_dargs_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from dargs import ArgumentEncoder 4 | 5 | from dpdispatcher import Machine, Resources, Task 6 | 7 | resources_dargs = Resources.arginfo() 8 | with open("dpdispatcher-resources.json", "w") as f: 9 | json.dump(resources_dargs, f, cls=ArgumentEncoder) 10 | 11 | machine_dargs = Machine.arginfo() 12 | with open("dpdispatcher-machine.json", "w") as f: 13 | json.dump(machine_dargs, f, cls=ArgumentEncoder) 14 | 15 | task_dargs = Task.arginfo() 16 | with open("dpdispatcher-task.json", "w") as f: 17 | json.dump(task_dargs, f, cls=ArgumentEncoder) 18 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | out.txt -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/__init__.py -------------------------------------------------------------------------------- /tests/batch.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_type": "pbs", 3 | "context_type": "lazy_local", 4 | "local_root" : "./test_batch_object", 5 | "remote_root" : "./tmp/" 6 | } -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import pathlib 4 | import sys 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | 8 | import dpdispatcher # noqa: F401 9 | from dpdispatcher.base_context import BaseContext # noqa: F401 10 | from dpdispatcher.contexts.hdfs_context import HDFSContext # noqa: F401 11 | from dpdispatcher.contexts.lazy_local_context import LazyLocalContext # noqa: F401 12 | from dpdispatcher.contexts.local_context import LocalContext # noqa: F401 13 | from dpdispatcher.contexts.ssh_context import SSHContext, SSHSession # noqa: F401 14 | 15 | # test backward compatibility with dflow 16 | from dpdispatcher.dpcloudserver.client import RequestInfoException as _ # noqa: F401 17 | from dpdispatcher.entrypoints.run import run # noqa: F401 18 | from dpdispatcher.entrypoints.submission import handle_submission # noqa: F401 19 | from dpdispatcher.machine import Machine # noqa: F401 20 | from dpdispatcher.machines.distributed_shell import DistributedShell # noqa: F401 21 | from dpdispatcher.machines.dp_cloud_server import Lebesgue # noqa: F401 22 | from dpdispatcher.machines.JH_UniScheduler import 
JH_UniScheduler # noqa: F401 23 | from dpdispatcher.machines.lsf import LSF # noqa: F401 24 | from dpdispatcher.machines.pbs import PBS # noqa: F401 25 | from dpdispatcher.machines.shell import Shell # noqa: F401 26 | from dpdispatcher.machines.slurm import Slurm # noqa: F401 27 | from dpdispatcher.submission import Job, Resources, Submission, Task # noqa: F401 28 | from dpdispatcher.utils.hdfs_cli import HDFS # noqa: F401 29 | from dpdispatcher.utils.job_status import JobStatus # noqa: F401 30 | from dpdispatcher.utils.record import record # noqa: F401 31 | from dpdispatcher.utils.utils import RetrySignal, retry # noqa: F401 32 | 33 | 34 | def setUpModule(): 35 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 36 | 37 | 38 | def get_file_md5(file_path): 39 | return hashlib.md5(pathlib.Path(file_path).read_bytes()).hexdigest() 40 | -------------------------------------------------------------------------------- /tests/debug_test_class_submission_init.py: -------------------------------------------------------------------------------- 1 | # import os,sys,json,glob,shutil,uuid,time 2 | # import unittest 3 | # from unittest.mock import MagicMock, patch, PropertyMock 4 | 5 | # sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | # __package__ = 'tests' 7 | # from .context import LocalSession 8 | # from .context import LocalContext 9 | # from .context import PBS 10 | # from .context import JobStatus 11 | # from .context import Submission, Job, Task, Resources 12 | from sample_class import SampleClass 13 | 14 | # print('in', SampleClass.get_sample_empty_submission()) 15 | 16 | 17 | empty_submission = SampleClass.get_sample_empty_submission() 18 | task = SampleClass.get_sample_task() 19 | # print('TestSubmissionInit.test_reigister_task:self.empty_submission.belonging_tasks', empty_submission.belonging_tasks) 20 | empty_submission.register_task(task=task) 21 | # print('7890809', SampleClass.get_sample_empty_submission().belonging_tasks) 22 | empty_submission.register_task(task=task) 23 | # print('1441198', SampleClass.get_sample_empty_submission().belonging_tasks) 24 | # self.assertEqual([task], empty_submission.belonging_tasks) 25 | # print('out', SampleClass.get_sample_empty_submission()) 26 | # print('TestSubmissionInit.test_register_task_list:task_list', task_list) 27 | # empty_submission = SampleClass.get_sample_empty_submission() 28 | # task_list = SampleClass.get_sample_task_list() 29 | # empty_submission.register_task_list(task_list=task_list) 30 | # self.empty_submission.register_task_list(task_list=task_list) 31 | # self.assertEqual(task_list, empty_submission.belonging_tasks) 32 | 33 | # def tesk_generate_jobs(self): 34 | # task_list = SampleClass.get_sample_task_list() 35 | # self.submission.register_task_list(task_list=task_list) 36 | # self.submission.generate_jobs() 37 | # task1, task2, task3, task4 = task_list 38 | # task_ll = [job.job_task_list for job in self.submission.belonging_jobs] 39 | # self.assertEqual([[task3, task2], [task4, task1]], task_ll) 40 | -------------------------------------------------------------------------------- /tests/devel_test_JH_UniScheduler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | from dpdispatcher.machine import Machine 6 | from dpdispatcher.submission import Resources, Submission, Task 7 | 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 9 | 10 | # task_need_resources has no 
effect 11 | with open("jsons/machine_jh_unischeduler.json") as f: 12 | mdata = json.load(f) 13 | 14 | machine = Machine.load_from_dict(mdata["machine"]) 15 | resources = Resources.load_from_dict(mdata["resources"]) 16 | 17 | submission = Submission( 18 | work_base="0_md/", 19 | machine=machine, 20 | resources=resources, 21 | forward_common_files=["graph.pb"], 22 | backward_common_files=[], 23 | ) 24 | 25 | task1 = Task( 26 | command="lmp -i input.lammps", 27 | task_work_path="bct-1/", 28 | forward_files=["conf.lmp", "input.lammps"], 29 | backward_files=["log.lammps"], 30 | ) 31 | task2 = Task( 32 | command="lmp -i input.lammps", 33 | task_work_path="bct-2/", 34 | forward_files=["conf.lmp", "input.lammps"], 35 | backward_files=["log.lammps"], 36 | ) 37 | task3 = Task( 38 | command="lmp -i input.lammps", 39 | task_work_path="bct-3/", 40 | forward_files=["conf.lmp", "input.lammps"], 41 | backward_files=["log.lammps"], 42 | ) 43 | task4 = Task( 44 | command="lmp -i input.lammps", 45 | task_work_path="bct-4/", 46 | forward_files=["conf.lmp", "input.lammps"], 47 | backward_files=["log.lammps"], 48 | ) 49 | submission.register_task_list( 50 | [ 51 | task1, 52 | task2, 53 | task3, 54 | task4, 55 | ] 56 | ) 57 | submission.run_submission(clean=True) 58 | -------------------------------------------------------------------------------- /tests/devel_test_ali_ehpc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | # from .sample_class import SampleClass 7 | # from .context import Machine 8 | # from .context import Resources 9 | # from dpdispatcher.local_context import LocalSession 10 | # from dpdispatcher.local_context import LocalContext 11 | # from dpdispatcher.lazy_local_context import LazyLocalContext 12 | 13 | from dpdispatcher.machine import Machine 14 | from dpdispatcher.submission import Resources 15 | from tests.sample_class import SampleClass 16 | 17 | # from dpdispatcher.pbs import PBS 18 | 19 | # local_session = LocalSession({'work_path':'test_work_path/'}) 20 | # local_context = LocalContext(local_root='test_pbs_dir/', work_profile=local_session) 21 | 22 | # lazy_local_context = LazyLocalContext(local_root='/home/fengbo/10_dpdispatcher/dpdispatcher/tests/test_pbs_dir', work_profile=None) 23 | 24 | # ssh_session = SSHSession(hostname='39.106.84.25', remote_root='/home/fengbo/dp_remote', username='fengbo') 25 | # ssh_context = SSHContext(local_root='test_slurm_dir', ssh_session=ssh_session) 26 | 27 | # pbs = PBS(context=ssh_context) 28 | # pbs = PBS(context=local_context) 29 | # pbs = PBS(context=lazy_local_context) 30 | 31 | submission = SampleClass.get_sample_submission() 32 | # pbs = SampleClass.get_sample_pbs_local_context() 33 | # slurm = SampleClass.get_sample_slurm_local_context() 34 | 35 | with open("jsons/machine_ali_ehpc.json") as f: 36 | compute_dict = json.load(f) 37 | 38 | machine = Machine.load_from_dict(compute_dict["machine"]) 39 | resources = Resources.load_from_dict(compute_dict["resources"]) 40 | 41 | submission.resouces = resources 42 | submission.bind_machine(machine=machine) 43 | # submission.run_submission() 44 | # submission.run_submission(exit_on_submit=True) 45 | submission.run_submission() 46 | 47 | 48 | # resources = Resources(number_node=1, cpu_per_node=4, gpu_per_node=1, queue_name="V100_8_32", group_size=2, if_cuda_multi_devices=True) 49 | # submission = Submission(work_base='0_md/', 
resources=resources, forward_common_files=['graph.pb'], backward_common_files=[]) #, batch=PBS) 50 | # task1 = Task(command='lmp_serial -i input.lammps', task_work_path='bct-1/', forward_files=['conf.lmp', 'input.lammps'], backward_files=['log.lammps'], task_need_resources=1) 51 | # task2 = Task(command='lmp_serial -i input.lammps', task_work_path='bct-2/', forward_files=['conf.lmp', 'input.lammps'], backward_files=['log.lammps'], task_need_resources=0.25) 52 | # task3 = Task(command='lmp_serial -i input.lammps', task_work_path='bct-3/', forward_files=['conf.lmp', 'input.lammps'], backward_files=['log.lammps'], task_need_resources=0.25) 53 | # task4 = Task(command='lmp_serial -i input.lammps', task_work_path='bct-4/', forward_files=['conf.lmp', 'input.lammps'], backward_files=['log.lammps'], task_need_resources=0.5) 54 | # submission.register_task_list([task1, task2, task3, task4, ]) 55 | # submission.generate_jobs() 56 | # submission.bind_batch(batch=pbs) 57 | # for job in submission.belonging_jobs: 58 | # job.job_to_json() 59 | # print('111', submission) 60 | # submission2 = Submission.recover_jobs_from_json('./jr.json') 61 | # print('222', submission2) 62 | # print(submission==submission2) 63 | 64 | # submission1.dump_jobs_fo_json() 65 | # submission2 = Submission.submission_from_json('jsons/submission.json') 66 | # print(677, submission==submission2) 67 | # print(submission1.belonging_jobs) 68 | # print(local_context) 69 | -------------------------------------------------------------------------------- /tests/devel_test_dp_cloud_server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | 7 | # from dpdispatcher.batch_object import BatchObject 8 | from dpdispatcher.machine import Machine 9 | from dpdispatcher.submission import Resources, Submission, Task 10 | 11 | # from dpdispatcher.slurm import SlurmResources, Slurm 12 | 13 | # local_session = LocalSession({'work_path':'temp2'}) 14 | # local_context = LocalContext(local_root='test_slurm_dir/', work_profile=local_session) 15 | # lazy_local_context = LazyLocalContext(local_root='./') 16 | 17 | 18 | # machine_dict = dict(hostname='localhost', remote_root='/home/dp/dpdispatcher/tests/temp2', username='dp') 19 | # ssh_session = SSHSession(**machine_dict) 20 | # ssh_session = SSHSession(hostname='8.131.233.55', remote_root='/home/dp/dp_remote', username='dp') 21 | # ssh_context = SSHContext(local_root='test_slurm_dir', ssh_session=ssh_session) 22 | # slurm = Slurm(context=ssh_context) 23 | # slurm = Slurm(context=lazy_local_context) 24 | 25 | # resources = Resources(number_node=1, cpu_per_node=4, gpu_per_node=0, queue_name="1 * NVIDIA P100", group_size=4) 26 | # slurm_sbatch_dict={'mem': '10G', 'cpus_per_task':1, 'time': "120:0:0"} 27 | # slurm_resources = SlurmResources(resources=resources, slurm_sbatch_dict=slurm_sbatch_dict) 28 | 29 | 30 | # dp_cloud_server_context = DpCloudServerContext( 31 | # local_root='test_context_dir/', 32 | # username='yfb222333', 33 | # password='yfb222333') 34 | # dp_cloud_server = DpCloudServer(context=dp_cloud_server_context) 35 | # with open('test_dp_cloud_server.json', 'r') as f: 36 | # jdata = json.load(f) 37 | with open("jsons/machine_dp_cloud_server.json") as f: 38 | compute_dict = json.load(f) 39 | 40 | machine = Machine.load_from_dict(compute_dict["machine"]) 41 | resources = Resources.load_from_dict(compute_dict["resources"]) 42 | 
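# Illustrative sketch (commented out, not executed by this script): Machine and
# Resources can also be built from inline dicts instead of a JSON file, e.g. a
# local Shell machine mirroring examples/machine/lazy_local.json:
# machine = Machine.load_from_dict(
#     {"batch_type": "Shell", "context_type": "LazyLocalContext", "local_root": "./"}
# )
# resources = Resources.load_from_dict(
#     {"number_node": 1, "cpu_per_node": 1, "gpu_per_node": 0, "group_size": 0}
# )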
43 | task1 = Task( 44 | command="lmp -i input.lammps", 45 | task_work_path="bct-1/", 46 | forward_files=["conf.lmp", "input.lammps"], 47 | backward_files=["log.lammps"], 48 | ) 49 | task2 = Task( 50 | command="lmp -i input.lammps", 51 | task_work_path="bct-2/", 52 | forward_files=["conf.lmp", "input.lammps"], 53 | backward_files=["log.lammps"], 54 | ) 55 | task3 = Task( 56 | command="lmp -i input.lammps", 57 | task_work_path="bct-3/", 58 | forward_files=["conf.lmp", "input.lammps"], 59 | backward_files=["log.lammps"], 60 | ) 61 | task4 = Task( 62 | command="lmp -i input.lammps", 63 | task_work_path="bct-4/", 64 | forward_files=["conf.lmp", "input.lammps"], 65 | backward_files=["log.lammps"], 66 | ) 67 | task_list = [ 68 | task1, 69 | task2, 70 | task3, 71 | task4, 72 | ] 73 | 74 | submission = Submission( 75 | work_base="0_md/", 76 | machine=machine, 77 | resources=resources, 78 | forward_common_files=["graph.pb"], 79 | backward_common_files=[], 80 | task_list=task_list, 81 | ) 82 | 83 | submission.run_submission() 84 | -------------------------------------------------------------------------------- /tests/devel_test_lazy_ali_ehpc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 5 | 6 | from dpdispatcher.lazy_local_context import LazyLocalContext 7 | from dpdispatcher.pbs import PBS 8 | from dpdispatcher.submission import Resources, Submission, Task 9 | 10 | # local_session = LocalSession({'work_path':'temp2'}) 11 | # local_context = LocalContext(local_root='temp1/0_md', work_profile=local_session) 12 | lazy_local_context = LazyLocalContext( 13 | local_root="/home/fengbo/10_dpdispatcher/dpdispatcher/tests/temp3", 14 | work_profile=None, 15 | ) 16 | 17 | # pbs = PBS(context=local_context) 18 | pbs = PBS(context=lazy_local_context) 19 | 20 | resources = Resources( 21 | number_node=1, 22 | cpu_per_node=4, 23 | gpu_per_node=1, 24 | queue_name="V100_8_32", 25 | group_size=4, 26 | if_cuda_multi_devices=True, 27 | ) 28 | submission = Submission(work_base="0_md", resources=resources) 29 | task1 = Task(command="lmp_serial -i input.lammps", task_work_path="bct-1") 30 | task2 = Task(command="lmp_serial -i input.lammps", task_work_path="bct-2") 31 | task3 = Task(command="lmp_serial -i input.lammps", task_work_path="bct-3") 32 | task4 = Task(command="lmp_serial -i input.lammps", task_work_path="bct-4") 33 | submission.register_task_list( 34 | [ 35 | task1, 36 | task2, 37 | task3, 38 | task4, 39 | ] 40 | ) 41 | submission.generate_jobs() 42 | submission.bind_batch(batch=pbs) 43 | # for job in submission.belonging_jobs: 44 | # job.job_to_json() 45 | # print('111', submission) 46 | # submission2 = Submission.recover_jobs_from_json('./jr.json') 47 | # print('222', submission2) 48 | # print(submission==submission2) 49 | submission.run_submission() 50 | 51 | # submission1.dump_jobs_fo_json() 52 | # submission2 = Submission.submission_from_json('jsons/submission.json') 53 | # print(677, submission==submission2) 54 | # print(submission1.belonging_jobs) 55 | # print(local_context) 56 | -------------------------------------------------------------------------------- /tests/devel_test_lsf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | from dpdispatcher.machine import Machine 6 | from dpdispatcher.submission import Resources, Submission, Task 7 | 8 | sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 9 | 10 | # task_need_resources has no effect 11 | with open("jsons/machine_lsf.json") as f: 12 | mdata = json.load(f) 13 | 14 | machine = Machine.load_from_dict(mdata["machine"]) 15 | resources = Resources.load_from_dict(mdata["resources"]) 16 | 17 | submission = Submission( 18 | work_base="0_md/", 19 | machine=machine, 20 | resources=resources, 21 | forward_common_files=["graph.pb"], 22 | backward_common_files=[], 23 | ) 24 | 25 | task1 = Task( 26 | command="lmp -i input.lammps", 27 | task_work_path="bct-1/", 28 | forward_files=["conf.lmp", "input.lammps"], 29 | backward_files=["log.lammps"], 30 | ) 31 | task2 = Task( 32 | command="lmp -i input.lammps", 33 | task_work_path="bct-2/", 34 | forward_files=["conf.lmp", "input.lammps"], 35 | backward_files=["log.lammps"], 36 | ) 37 | task3 = Task( 38 | command="lmp -i input.lammps", 39 | task_work_path="bct-3/", 40 | forward_files=["conf.lmp", "input.lammps"], 41 | backward_files=["log.lammps"], 42 | ) 43 | task4 = Task( 44 | command="lmp -i input.lammps", 45 | task_work_path="bct-4/", 46 | forward_files=["conf.lmp", "input.lammps"], 47 | backward_files=["log.lammps"], 48 | ) 49 | submission.register_task_list( 50 | [ 51 | task1, 52 | task2, 53 | task3, 54 | task4, 55 | ] 56 | ) 57 | submission.run_submission(clean=True) 58 | -------------------------------------------------------------------------------- /tests/devel_test_shell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 5 | 6 | from dpdispatcher.local_context import LocalContext, LocalSession 7 | 8 | # from dpdispatcher.pbs import PBS 9 | from dpdispatcher.shell import Shell 10 | from dpdispatcher.submission import Resources, Submission, Task 11 | 12 | local_session = LocalSession({"work_path": "temp2"}) 13 | local_context = LocalContext(local_root="test_shell_dir/", work_profile=local_session) 14 | # lazy_local_context = LazyLocalContext(local_root='/home/fengbo/10_dpdispatcher/dpdispatcher/tests/temp3/0_md', work_profile=None) 15 | shell = Shell(context=local_context) 16 | # pbs = PBS(context=lazy_local_context) 17 | 18 | resources = Resources( 19 | number_node=1, cpu_per_node=4, gpu_per_node=1, queue_name="V100_8_32", group_size=4 20 | ) 21 | submission = Submission( 22 | work_base="0_md", 23 | resources=resources, 24 | forward_common_files=["graph.pb"], 25 | backward_common_files=["submission.json"], 26 | ) # , batch=PBS) 27 | task1 = Task( 28 | command="lmp_serial -i input.lammps", 29 | task_work_path="bct-1", 30 | forward_files=["conf.lmp", "input.lammps"], 31 | backward_files=["log.lammps"], 32 | task_need_resources=1, 33 | ) 34 | task2 = Task( 35 | command="lmp_serial -i input.lammps", 36 | task_work_path="bct-2", 37 | forward_files=["conf.lmp", "input.lammps"], 38 | backward_files=["log.lammps"], 39 | task_need_resources=0.25, 40 | ) 41 | task3 = Task( 42 | command="lmp_serial -i input.lammps", 43 | task_work_path="bct-3", 44 | forward_files=["conf.lmp", "input.lammps"], 45 | backward_files=["log.lammps"], 46 | task_need_resources=0.25, 47 | ) 48 | task4 = Task( 49 | command="lmp_serial -i input.lammps", 50 | task_work_path="bct-4", 51 | forward_files=["conf.lmp", "input.lammps"], 52 | backward_files=["log.lammps"], 53 | task_need_resources=0.5, 54 | ) 55 | submission.register_task_list( 56 | [ 57 | task1, 58 | task2, 59 | task3, 60 | task4, 61 | ] 62 | ) 63 | 
submission.generate_jobs() 64 | submission.bind_batch(batch=shell) 65 | # for job in submission.belonging_jobs: 66 | # job.job_to_json() 67 | # print('111', submission) 68 | # submission2 = Submission.recover_jobs_from_json('./jr.json') 69 | # print('222', submission2) 70 | # print(submission==submission2) 71 | submission.run_submission() 72 | 73 | # submission1.dump_jobs_fo_json() 74 | # submission2 = Submission.submission_from_json('jsons/submission.json') 75 | # print(677, submission==submission2) 76 | # print(submission1.belonging_jobs) 77 | # print(local_context) 78 | -------------------------------------------------------------------------------- /tests/devel_test_slurm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | # from .context import dpdispatcher 7 | # from dpdispatcher.local_context import LocalContext 8 | from dpdispatcher.machine import Machine 9 | from dpdispatcher.submission import Resources, Submission, Task 10 | 11 | # from dpdispatcher.ssh_context import SSHContext 12 | 13 | 14 | # from dpdispatcher.submission import 15 | # from dpdispatcher.slurm import Slurm 16 | 17 | # local_session = LocalSession({'work_path':'temp2'}) 18 | # local_context = LocalContext(local_root='test_slurm_dir/', work_profile=local_session) 19 | # lazy_local_context = LazyLocalContext(local_root='test_slurm_dir/') 20 | 21 | 22 | # machine_dict = dict(hostname='localhost', remote_root='/home/dp/dpdispatcher/tests/temp2', username='dp') 23 | # ssh_session = SSHSession(**machine_dict) 24 | # ssh_session = SSHSession(hostname='8.131.233.55', remote_root='/home/dp/dp_remote', username='dp') 25 | # ssh_context = SSHContext(local_root='test_slurm_dir', ssh_session=ssh_session) 26 | # slurm = Slurm(context=ssh_context) 27 | # slurm = Slurm(context=lazy_local_context) 28 | 29 | # resources = Resources(number_node=1, cpu_per_node=4, gpu_per_node=2, queue_name="GPU_2080Ti", group_size=4, 30 | # custom_flags=['#SBATCH --exclude=2080ti000,2080ti001,2080ti002,2080ti004,2080ti005,2080ti006'], 31 | # para_deg=2, 32 | # strategy={"if_cuda_multi_devices":True}) 33 | # slurm_sbatch_dict={'mem': '10G', 'cpus_per_task':1, 'time': "120:0:0"} 34 | # slurm_resources = SlurmResources(resources=resources, slurm_sbatch_dict=slurm_sbatch_dict) 35 | 36 | with open("jsons/machine_slurm.json") as f: 37 | mdata = json.load(f) 38 | 39 | machine = Machine.load_from_dict(mdata["machine"]) 40 | resources = Resources.load_from_dict(mdata["resources"]) 41 | 42 | submission = Submission( 43 | work_base="0_md/", 44 | machine=machine, 45 | resources=resources, 46 | forward_common_files=["graph.pb"], 47 | backward_common_files=[], 48 | ) # , batch=PBS) 49 | task1 = Task( 50 | command="lmp -i input.lammps", 51 | task_work_path="bct-1/", 52 | forward_files=["conf.lmp", "input.lammps"], 53 | backward_files=["log.lammps"], 54 | ) 55 | task2 = Task( 56 | command="lmp -i input.lammps", 57 | task_work_path="bct-2/", 58 | forward_files=["conf.lmp", "input.lammps"], 59 | backward_files=["log.lammps"], 60 | ) 61 | task3 = Task( 62 | command="lmp -i input.lammps", 63 | task_work_path="bct-3/", 64 | forward_files=["conf.lmp", "input.lammps"], 65 | backward_files=["log.lammps"], 66 | ) 67 | task4 = Task( 68 | command="lmp -i input.lammps", 69 | task_work_path="bct-4/", 70 | forward_files=["conf.lmp", "input.lammps"], 71 | backward_files=["log.lammps"], 72 | ) 73 | 
submission.register_task_list( 74 | [ 75 | task1, 76 | task2, 77 | task3, 78 | task4, 79 | ] 80 | ) 81 | submission.run_submission(clean=True) 82 | -------------------------------------------------------------------------------- /tests/devel_test_ssh_ali_ehpc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | 7 | # from dpdispatcher.local_context import LocalSession 8 | # from dpdispatcher.local_context import LocalContext 9 | 10 | # from dpdispatcher.batch import Batch 11 | # from dpdispatcher.pbs import PBS 12 | from dpdispatcher.batch_object import BatchObject 13 | from dpdispatcher.submission import Resources, Submission, Task 14 | 15 | with open("ssh_machine.json") as f: 16 | jdata = json.load(f) 17 | 18 | batch = BatchObject(jdata=jdata) 19 | # local_session = LocalSession({'work_path':'temp2'}) 20 | # local_context = LocalContext(local_root='temp1/', work_profile=local_session) 21 | # lazy_local_context = LazyLocalContext(local_root='/home/fengbo/10_dpdispatcher/dpdispatcher/tests/temp3/0_md', work_profile=None) 22 | # pbs = PBS(context=lazy_local_context) 23 | # ssh_session = SSHSession(hostname='39.106.84.25', remote_root='/home/fengbo/dp_remote', username='fengbo') 24 | # ssh_context = SSHContext(local_root='test_slurm_dir', ssh_session=ssh_session) 25 | # jdata = 26 | 27 | # pbs = PBS(context=ssh_context) 28 | 29 | resources = Resources( 30 | number_node=1, cpu_per_node=4, gpu_per_node=1, queue_name="V100_8_32", group_size=4 31 | ) 32 | submission = Submission( 33 | work_base="test_pbs_dir/0_md", 34 | resources=resources, 35 | forward_common_files=["graph.pb"], 36 | backward_common_files=[], 37 | ) # , batch=PBS) 38 | task1 = Task( 39 | command="lmp_serial -i input.lammps", 40 | task_work_path="bct-1", 41 | forward_files=["conf.lmp", "input.lammps"], 42 | backward_files=["log.lammps"], 43 | ) 44 | task2 = Task( 45 | command="lmp_serial -i input.lammps", 46 | task_work_path="bct-2", 47 | forward_files=["conf.lmp", "input.lammps"], 48 | backward_files=["log.lammps"], 49 | ) 50 | task3 = Task( 51 | command="lmp_serial -i input.lammps", 52 | task_work_path="bct-3", 53 | forward_files=["conf.lmp", "input.lammps"], 54 | backward_files=["log.lammps"], 55 | ) 56 | task4 = Task( 57 | command="lmp_serial -i input.lammps", 58 | task_work_path="bct-4", 59 | forward_files=["conf.lmp", "input.lammps"], 60 | backward_files=["log.lammps"], 61 | ) 62 | submission.register_task_list( 63 | [ 64 | task1, 65 | task2, 66 | task3, 67 | task4, 68 | ] 69 | ) 70 | submission.generate_jobs() 71 | submission.bind_batch(batch=batch) 72 | # for job in submission.belonging_jobs: 73 | # job.job_to_json() 74 | # print('111', submission) 75 | # submission2 = Submission.recover_jobs_from_json('./jr.json') 76 | # print('222', submission2) 77 | # print(submission==submission2) 78 | submission.run_submission() 79 | 80 | # submission1.dump_jobs_fo_json() 81 | # submission2 = Submission.submission_from_json('jsons/submission.json') 82 | # print(677, submission==submission2) 83 | # print(submission1.belonging_jobs) 84 | # print(local_context) 85 | -------------------------------------------------------------------------------- /tests/graph.pb: -------------------------------------------------------------------------------- 1 | # mock file 2 | # origin file Sn SCAN functional label:continue-2/000 by yfb222333@gmail.com; github:felix5572 3 | # model 
version 1.2: wget https://deepmd-kit.oss-cn-beijing.aliyuncs.com/graph.pb 4 | # model version 2.0: wget https://deepmd-kit.oss-cn-beijing.aliyuncs.com/graph_Sn_convert_from_1.2_to_2.0.pb 5 | -------------------------------------------------------------------------------- /tests/hello_world.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # # dpdispatcher doesn't use `requires-python` and `dependencies` 3 | # requires-python = ">=3" 4 | # dependencies = [ 5 | # ] 6 | # [tool.dpdispatcher] 7 | # work_base = "./" 8 | # forward_common_files=[] 9 | # backward_common_files=[] 10 | # [tool.dpdispatcher.machine] 11 | # batch_type = "Shell" 12 | # local_root = "./" 13 | # context_type = "LazyLocalContext" 14 | # [tool.dpdispatcher.resources] 15 | # number_node = 1 16 | # cpu_per_node = 1 17 | # gpu_per_node = 0 18 | # group_size = 0 19 | # [[tool.dpdispatcher.task_list]] 20 | # # no need to contain the script filename 21 | # command = "python" 22 | # # can be a glob pattern 23 | # task_work_path = "./" 24 | # forward_files = [] 25 | # backward_files = ["log"] 26 | # /// 27 | 28 | print("hello world!") 29 | -------------------------------------------------------------------------------- /tests/jsons/job.json: -------------------------------------------------------------------------------- 1 | { 2 | "bc1a7297489e921034ced5036cb23ef9daf7b681": { 3 | "job_task_list": [ 4 | { 5 | "command": "lmp -i input.lammps", 6 | "task_work_path": "bct-3/", 7 | "forward_files": [ 8 | "conf.lmp", 9 | "input.lammps" 10 | ], 11 | "backward_files": [ 12 | "log.lammps" 13 | ], 14 | "outlog": "log", 15 | "errlog": "err" 16 | }, 17 | { 18 | "command": "lmp -i input.lammps", 19 | "task_work_path": "bct-2/", 20 | "forward_files": [ 21 | "conf.lmp", 22 | "input.lammps" 23 | ], 24 | "backward_files": [ 25 | "log.lammps" 26 | ], 27 | "outlog": "log", 28 | "errlog": "err" 29 | } 30 | ], 31 | "resources": { 32 | "number_node": 1, 33 | "cpu_per_node": 4, 34 | "gpu_per_node": 1, 35 | "queue_name": "T4_4_15", 36 | "group_size": 2, 37 | "custom_flags": [], 38 | "strategy": { 39 | "if_cuda_multi_devices": false 40 | }, 41 | "para_deg": 1, 42 | "module_unload_list": [], 43 | "module_list": [], 44 | "source_list": [], 45 | "envs": {}, 46 | "kwargs": {} 47 | }, 48 | "job_state": null, 49 | "job_id": "", 50 | "fail_count": 0 51 | } 52 | } -------------------------------------------------------------------------------- /tests/jsons/machine.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "PBS", 4 | "context_type": "LocalContext", 5 | "local_root" : "./", 6 | "remote_root" : "./", 7 | "remote_profile": {} 8 | }, 9 | "resources":{ 10 | "number_node": 1, 11 | "cpu_per_node": 4, 12 | "gpu_per_node": 1, 13 | "queue_name": "T4_4_15", 14 | "group_size": 5 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tests/jsons/machine_JH_UniScheduler.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine": { 3 | "batch_type": "JH_UniScheduler", 4 | "context_type": "local", 5 | "local_root": "./", 6 | "remote_root": "/data/home/wangsimin/machine_learning/dpgen/task/test/dpgen_example/run1" 7 | }, 8 | "resources":{ 9 | "number_node": 1, 10 | "cpu_per_node": 4, 11 | "gpu_per_node": 1, 12 | "queue_name": "gpu", 13 | "group_size": 4, 14 | "source_list": ["/public/software/deepmd-kit/bin/activate /public/software/deepmd-kit"] 15 | 
} 16 | } 17 | -------------------------------------------------------------------------------- /tests/jsons/machine_ali_ehpc.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "PBS", 4 | "context_type": "SSHContext", 5 | "local_root": "./test_context_dir", 6 | "remote_root": "/home/yuanfengbo/work_path_dpdispatcher_test", 7 | "remote_profile": { 8 | "hostname": "39.103.186.143", 9 | "username": "yuanfengbo" 10 | } 11 | }, 12 | "resources":{ 13 | "number_node": 1, 14 | "cpu_per_node": 8, 15 | "gpu_per_node": 1, 16 | "queue_name": "V100_8_32", 17 | "group_size": 5 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tests/jsons/machine_center.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_lazy_pbs":{ 3 | "batch_type": "pbs", 4 | "context_type": "lazy_local", 5 | "local_root": "./" 6 | }, 7 | "batch_local_slurm":{ 8 | "batch_type": "slurm", 9 | "context_type": "local", 10 | "local_root": "./", 11 | "remote_root": "./" 12 | }, 13 | "batch_ehpc_ssh_pbs":{ 14 | "batch_type":"pbs", 15 | "context_type":"ssh", 16 | "local_root": "./", 17 | "remote_root": "~/dpdispatcher_work_dir", 18 | "hostname": "39.106.xx.xxx", 19 | "username": "user1" 20 | }, 21 | "resources_gpu":{ 22 | "number_node": 1, 23 | "cpu_per_node": 4, 24 | "gpu_per_node": 1, 25 | "queue_name": "T4_4_14", 26 | "group_size": 5 27 | }, 28 | "resources_cpu":{ 29 | "number_node": 1, 30 | "cpu_per_node": 8, 31 | "gpu_per_node": 0, 32 | "queue_name": "C32_64", 33 | "group_size": 5 34 | } 35 | } -------------------------------------------------------------------------------- /tests/jsons/machine_diffenert.json: -------------------------------------------------------------------------------- 1 | { 2 | "train":{ 3 | "batch":{ 4 | "batch_type": "pbs", 5 | "context_type": "lazy_local", 6 | "local_root": "./" 7 | }, 8 | "resources":{ 9 | "number_node": 1, 10 | "cpu_per_node": 4, 11 | "gpu_per_node": 1, 12 | "queue_name": "T4_4_14", 13 | "group_size": 5 14 | } 15 | }, 16 | "md":{ 17 | "batch":{ 18 | "batch_type": "Shell", 19 | "context_type": "lazy_local", 20 | "local_root": "./" 21 | }, 22 | "resources":{ 23 | "number_node": null, 24 | "cpu_per_node": null, 25 | "gpu_per_node": 0, 26 | "queue_name": null, 27 | "group_size": 5 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /tests/jsons/machine_dp_cloud_server.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "DpCloudServer", 4 | "context_type": "DpCloudServerContext", 5 | "local_root" : "./test_context_dir/", 6 | "remote_profile":{ 7 | "username": "yfb222333", 8 | "password": "", 9 | "input_data":{ 10 | "job_type": "indicate", 11 | "log_file": "dp_cloud_server.log", 12 | "command": "", 13 | "backward_files": [], 14 | "job_name": "dpgen_20210604_job", 15 | "machine": { 16 | "platform": "ali", 17 | "resources": { 18 | "gpu_type": "1 * NVIDIA P100", 19 | "cpu_num": 4, 20 | "mem_limit": 28, 21 | "disk_size": 100, 22 | "region": "cn-beijing", 23 | "time_limit": "2:00:00", 24 | "image_name": "yfb-deepmd-kit-1.2.4-cuda10" 25 | } 26 | }, 27 | "job_resources": null 28 | } 29 | } 30 | }, 31 | "resources": { 32 | "number_node": 1, 33 | "cpu_per_node": 4, 34 | "gpu_per_node": 1, 35 | "queue_name": "GPU", 36 | "group_size": 5 37 | } 38 | } 39 | 
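
For orientation, a minimal sketch of how machine/resources/task fixtures like the ones above are consumed through the dpdispatcher API exercised by the tests later in this section (tests/test_group_size.py, tests/test_argcheck.py, tests/test_import_classes.py). Paths are relative to the tests/ directory, only calls that appear in those tests are used, and the sketch stops before any actual submission:

import json

from dpdispatcher import Machine, Resources, Submission, Task

with open("jsons/machine.json") as f:
    data = json.load(f)
with open("jsons/task.json") as f:
    task_data = json.load(f)

# load_from_dict also normalizes values, e.g. context_type "local" -> "LocalContext"
# (see tests/test_argcheck.py)
machine = Machine.load_from_dict(data["machine"])
resources = Resources.load_from_dict(data["resources"])
task = Task.load_from_dict(task_data)

submission = Submission(".", machine, resources, task_list=[task])
# tasks are grouped into jobs according to group_size (see tests/test_group_size.py)
submission.generate_jobs()
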
-------------------------------------------------------------------------------- /tests/jsons/machine_fugaku.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine": { 3 | "batch_type": "fugaku", 4 | "context_type": "SSHContext", 5 | "local_root" : "./", 6 | "remote_profile": { 7 | "hostname": "login.****.jp", 8 | "key_filename":"/home/***/.ssh/***", 9 | "passphrase":"******", 10 | "username": "u*****" 11 | }, 12 | "remote_root": "/vol*****/data/****" 13 | }, 14 | "resources": { 15 | "number_node": 1, 16 | "cpu_per_node": 48, 17 | "source_list": [""], 18 | "queue_name": "small", 19 | "group_size": 1, 20 | "custom_flags" : ["#PJM -L \"elapse=4:00:00\"", 21 | "#PJM -x PJM_LLIO_GFSCACHE=/vol0004", 22 | "#PJM -g hp******"] 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/jsons/machine_if_cuda_multi_devices.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "Shell", 4 | "context_type": "LocalContext", 5 | "local_root" : "test_if_cuda_multi_devices/", 6 | "remote_root" : "tmp_if_cuda_multi_devices/", 7 | "remote_profile":{} 8 | }, 9 | "resources":{ 10 | "number_node": 1, 11 | "cpu_per_node": 4, 12 | "gpu_per_node": 4, 13 | "queue_name": "GPU_2080Ti", 14 | "group_size": 16, 15 | "para_deg": 2, 16 | "strategy": { 17 | "if_cuda_multi_devices": true 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/jsons/machine_lazy_local_jh_unischeduler.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine": { 3 | "batch_type": "JH_UniScheduler", 4 | "context_type": "LazyLocalContext", 5 | "local_root": "./test_jh_unischeduler" 6 | }, 7 | "resources": { 8 | "number_node": 1, 9 | "cpu_per_node": 4, 10 | "queue_name": "gpu", 11 | "gpu_per_node": 1, 12 | "group_size": 4, 13 | "strategy": { 14 | "if_cuda_multi_devices": false 15 | }, 16 | "source_list": ["./slurm_test.env"] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/jsons/machine_lazy_local_lsf.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "LSF", 4 | "context_type": "LazyLocalContext", 5 | "local_root": "./test_lsf_dir" 6 | }, 7 | "resources":{ 8 | "number_node": 1, 9 | "cpu_per_node": 4, 10 | "gpu_per_node": 1, 11 | "queue_name": "gpu", 12 | "group_size": 4, 13 | "custom_flags": [ 14 | "#BSUB -R \"select[hname != g005]\"", 15 | "#BSUB -W 24:00" 16 | ], 17 | "strategy": { 18 | "if_cuda_multi_devices": false 19 | }, 20 | "para_deg": 1, 21 | "module_unload_list": [], 22 | "module_purge": true, 23 | "module_list": [ 24 | "use.own", 25 | "deepmd/1.3" 26 | ], 27 | "source_list": [ 28 | "/data/home/ypliu/scripts/avail_gpu.sh", 29 | "/data/home/ypliu/dprun/tf_envs.sh" 30 | ], 31 | "envs": {"DP_DISPATCHER_EXPORT": "test_foo_bar_baz"}, 32 | "prepend_script": [ 33 | "echo 'The summer you were there.'" 34 | ], 35 | "append_script": [ 36 | "echo 'shizuku'", 37 | "echo 'kaori'" 38 | ], 39 | "kwargs": { 40 | "gpu_usage": true, 41 | "gpu_new_syntax": true, 42 | "gpu_exclusive": false 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tests/jsons/machine_lazy_local_slurm.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | 
"batch_type": "Slurm", 4 | "context_type": "LazyLocalContext", 5 | "local_root": "./test_slurm_dir" 6 | }, 7 | "resources":{ 8 | "number_node": 1, 9 | "cpu_per_node": 4, 10 | "gpu_per_node": 2, 11 | "queue_name": "GPU_2080Ti", 12 | "group_size": 4, 13 | "custom_flags": ["#SBATCH --nice=100", "#SBATCH --time=24:00:00"], 14 | "strategy": { 15 | "if_cuda_multi_devices": true 16 | }, 17 | "para_deg": 2, 18 | "module_unload_list": ["singularity"], 19 | "module_list": ["singularity/3.0.0"], 20 | "source_list": ["./slurm_test.env"], 21 | "envs": {"DP_DISPATCHER_EXPORT_VAR": "test_foo_bar_baz"}, 22 | "custom_gpu_line":"#SBATCH --gres=gpu:2080Ti:2" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/jsons/machine_lazylocal_shell.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "Shell", 4 | "context_type": "LazyLocalContext", 5 | "local_root": "./test_shell_trival_dir" 6 | }, 7 | "resources":{ 8 | "number_node": 1, 9 | "cpu_per_node": 4, 10 | "gpu_per_node": 0, 11 | "queue_name": "CPU", 12 | "group_size": 2 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tests/jsons/machine_local_fugaku.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine": { 3 | "batch_type": "fugaku", 4 | "context_type": "local", 5 | "local_root" : "./", 6 | "remote_root": "./" 7 | }, 8 | "resources": { 9 | "number_node": 1, 10 | "cpu_per_node": 48, 11 | "source_list": [""], 12 | "queue_name": "small", 13 | "group_size": 1, 14 | "custom_flags" : ["#PJM -L \"elapse=4:00:00\"", 15 | "#PJM -x PJM_LLIO_GFSCACHE=/vol0004", 16 | "#PJM -g hp******"] 17 | } 18 | } -------------------------------------------------------------------------------- /tests/jsons/machine_local_shell.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "Shell", 4 | "context_type": "LocalContext", 5 | "local_root": "./test_shell_trival_dir", 6 | "remote_root": "./tmp_shell_trival_dir" 7 | }, 8 | "resources":{ 9 | "number_node": 1, 10 | "cpu_per_node": 4, 11 | "gpu_per_node": 0, 12 | "queue_name": "CPU", 13 | "group_size": 2 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/jsons/machine_lsf.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "LSF", 4 | "context_type": "SSHContext", 5 | "local_root": "./test_lsf_dir", 6 | "remote_root": "/data/home/ypliu/dptasks", 7 | "remote_profile": { 8 | "hostname": "123.45.78.99", 9 | "port": 56789, 10 | "username": "ypliu" 11 | } 12 | }, 13 | "resources":{ 14 | "number_node": 1, 15 | "cpu_per_node": 4, 16 | "gpu_per_node": 1, 17 | "queue_name": "gpu", 18 | "group_size": 4, 19 | "custom_flags": [ 20 | "#BSUB -R \"select[hname != g005]\"", 21 | "#BSUB -W 24:00" 22 | ], 23 | "strategy": { 24 | "if_cuda_multi_devices": false 25 | }, 26 | "para_deg": 1, 27 | "module_unload_list": [], 28 | "module_list": [ 29 | "use.own", 30 | "deepmd/1.3" 31 | ], 32 | "source_list": [ 33 | "/data/home/ypliu/scripts/avail_gpu.sh", 34 | "/data/home/ypliu/dprun/tf_envs.sh" 35 | ], 36 | "envs": {"DP_DISPATCHER_EXPORT": "test_foo_bar_baz"}, 37 | "kwargs": { 38 | "gpu_usage": true, 39 | "gpu_new_syntax": true 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- 
/tests/jsons/machine_openapi.json: -------------------------------------------------------------------------------- 1 | { 2 | "command": "lmp -i in.lammps -v restart 0", 3 | "machine": { 4 | "batch_type": "OpenAPI", 5 | "context_type": "OpenAPIContext", 6 | "remote_profile": { 7 | "project_id": 154, 8 | "command": "lmp -i in.lammps -v restart 0", 9 | "job_name": "dpgen_lammps_job", 10 | "machine_type":"c4_m15_1 * NVIDIA T4", 11 | "image_address":"registry.dp.tech/dev/test/deepmd-kit:2.2.1-cuda11.6" 12 | } 13 | }, 14 | "resources": { 15 | "group_size": 10 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/jsons/machine_slurm.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine":{ 3 | "batch_type": "Slurm", 4 | "context_type": "SSHContext", 5 | "local_root": "./test_context_dir", 6 | "remote_root": "/home/fengbo/work_path_dpdispatcher_test", 7 | "remote_profile": { 8 | "hostname": "xxx.200.xxx.59", 9 | "username": "fengbo" 10 | } 11 | }, 12 | "resources":{ 13 | "number_node": 1, 14 | "cpu_per_node": 4, 15 | "gpu_per_node": 2, 16 | "queue_name": "GPU_2080Ti", 17 | "group_size": 4, 18 | "custom_flags": ["#SBATCH --nice=100", "#SBATCH --time=24:00:00"], 19 | "strategy": { 20 | "if_cuda_multi_devices": true 21 | }, 22 | "para_deg": 2, 23 | "module_unload_list": ["singularity"], 24 | "module_list": ["singularity/3.0.0"], 25 | "source_list": ["./slurm_test.env"], 26 | "envs": {"DP_DISPATCHER_EXPORT": "test_foo_bar_baz"}, 27 | "kwargs": {} 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/jsons/machine_yarn.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine": { 3 | "batch_type": "DistributedShell", 4 | "context_type": "HDFSContext", 5 | "local_root": "./test_hdfs_dir", 6 | "remote_root": "/user/jenny/md/local_test" 7 | }, 8 | "resources": { 9 | "number_node": 1, 10 | "allow_failure": true, 11 | "ratio_failue": 0.05, 12 | "cpu_per_node": 32, 13 | "gpu_per_node": 0, 14 | "with_mpi": true, 15 | "queue_name": "root.oryx_bigbang", 16 | "cluster": "oryx", 17 | "group_size": 1, 18 | "source_list": ["/opt/intel/oneapi/setvars.sh"], 19 | "kwargs": { 20 | "yarn_path": "/opt/tiger/yarn_deploy/hadoop/share/hadoop/yarn", 21 | "img_name": "hub.byted.org/md/dpgen_fp:v1", 22 | "mem_limit": 64 23 | }, 24 | "envs" : {"HADOOP_HOME" : "${HADOOP_HOME:-/opt/tiger/yarn_deploy/hadoop}", 25 | "JAVA_HOME": "${JAVA_HOME:-/opt/tiger/jdk/jdk1.8}", 26 | "LD_LIBRARY_PATH": "${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native:${JAVA_HOME}/jre/lib/amd64/server", 27 | "CLASSPATH": "`${HADOOP_HOME}/bin/hadoop classpath --glob`", 28 | "PATH": "${HADOOP_HOME}/bin:/opt/vasp.5.4.4/bin:${PATH}"} 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/jsons/resources.json: -------------------------------------------------------------------------------- 1 | { 2 | "number_node": 1, 3 | "cpu_per_node": 4, 4 | "gpu_per_node": 1, 5 | "queue_name": "T4_4_15", 6 | "group_size": 2, 7 | "custom_flags": [], 8 | "strategy": { 9 | "if_cuda_multi_devices": false 10 | }, 11 | "para_deg": 1, 12 | "module_unload_list": [], 13 | "module_list": [], 14 | "source_list": [], 15 | "envs": {}, 16 | "wait_time": 0, 17 | "kwargs": {} 18 | } -------------------------------------------------------------------------------- /tests/jsons/task.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "command": "lmp -i input.lammps", 3 | "task_work_path": "bct-1/", 4 | "forward_files": [ 5 | "conf.lmp", 6 | "input.lammps" 7 | ], 8 | "backward_files": [ 9 | "log.lammps" 10 | ], 11 | "outlog": "log", 12 | "errlog": "err" 13 | } -------------------------------------------------------------------------------- /tests/script_gen_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import sys 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | __package__ = "tests" 8 | # from .context import LocalContext 9 | 10 | from .sample_class import SampleClass 11 | 12 | task_dict = SampleClass.get_sample_task_dict() 13 | assert os.path.isfile("jsons/task.json") is False 14 | with open("jsons/task.json", "w") as f: 15 | json.dump(task_dict, f, indent=4) 16 | 17 | resources_dict = SampleClass.get_sample_resources_dict() 18 | assert os.path.isfile("jsons/resources.json") is False 19 | with open("jsons/resources.json", "w") as f: 20 | json.dump(resources_dict, f, indent=4) 21 | 22 | pbs = SampleClass.get_sample_pbs_local_context() 23 | submission = SampleClass.get_sample_submission() 24 | submission.bind_machine(machine=pbs) 25 | assert os.path.isfile("jsons/submission.json") is False 26 | with open("jsons/submission.json", "w") as f: 27 | json.dump(submission.serialize(), f, indent=4) 28 | 29 | job_dict = SampleClass.get_sample_job_dict() 30 | assert os.path.isfile("jsons/job.json") is False 31 | with open("jsons/job.json", "w") as f: 32 | json.dump(job_dict, f, indent=4) 33 | -------------------------------------------------------------------------------- /tests/slurm_test.env: -------------------------------------------------------------------------------- 1 | export DP_DPDISPATCHER_TEST_VAR="dpdispatcher_foo_bar" 2 | -------------------------------------------------------------------------------- /tests/test_argcheck.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | 8 | from .context import Machine, Resources, Task, setUpModule # noqa: F401 9 | 10 | 11 | class TestJob(unittest.TestCase): 12 | def test_machine_argcheck(self): 13 | norm_dict = Machine.load_from_dict( 14 | { 15 | "batch_type": "slurm", 16 | "context_type": "local", 17 | "local_root": "./", 18 | "remote_root": "/some/path", 19 | } 20 | ).serialize() 21 | expected_dict = { 22 | "batch_type": "Slurm", 23 | "context_type": "LocalContext", 24 | "local_root": "./", 25 | "remote_root": "/some/path", 26 | "remote_profile": { 27 | "symlink": True, 28 | }, 29 | "clean_asynchronously": False, 30 | } 31 | self.assertDictEqual(norm_dict, expected_dict) 32 | 33 | def test_resources_argcheck(self): 34 | norm_dict = Resources.load_from_dict( 35 | { 36 | "number_node": 1, 37 | "cpu_per_node": 2, 38 | "gpu_per_node": 0, 39 | "queue_name": "haha", 40 | "group_size": 1, 41 | "envs": { 42 | "aa": "bb", 43 | }, 44 | "kwargs": { 45 | "cc": True, 46 | }, 47 | } 48 | ).serialize() 49 | expected_dict = { 50 | "append_script": [], 51 | "cpu_per_node": 2, 52 | "custom_flags": [], 53 | "envs": {"aa": "bb"}, 54 | "gpu_per_node": 0, 55 | "group_size": 1, 56 | "kwargs": { 57 | "cc": True, 58 | }, 59 | "module_list": [], 60 | "module_purge": 
False, 61 | "module_unload_list": [], 62 | "number_node": 1, 63 | "para_deg": 1, 64 | "prepend_script": [], 65 | "queue_name": "haha", 66 | "source_list": [], 67 | "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, 68 | "wait_time": 0, 69 | } 70 | self.assertDictEqual(norm_dict, expected_dict) 71 | 72 | def test_task_argcheck(self): 73 | norm_dict = Task.load_from_dict( 74 | { 75 | "command": "ls", 76 | "task_work_path": "./", 77 | "forward_files": [], 78 | "backward_files": [], 79 | "outlog": "out", 80 | "errlog": "err", 81 | } 82 | ).serialize() 83 | expected_dict = { 84 | "command": "ls", 85 | "task_work_path": "./", 86 | "forward_files": [], 87 | "backward_files": [], 88 | "outlog": "out", 89 | "errlog": "err", 90 | } 91 | self.assertDictEqual(norm_dict, expected_dict) 92 | -------------------------------------------------------------------------------- /tests/test_class_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | # from .context import LocalSession 8 | # from .context import LocalContext 9 | from .context import ( 10 | Job, 11 | Submission, 12 | setUpModule, # noqa: F401 13 | ) 14 | from .sample_class import SampleClass 15 | 16 | 17 | class TestJob(unittest.TestCase): 18 | def setUp(self): 19 | self.job = SampleClass.get_sample_job() 20 | 21 | self.submission2 = Submission.submission_from_json("jsons/submission.json") 22 | self.job2 = self.submission2.belonging_jobs[0] 23 | 24 | def test_eq(self): 25 | self.assertTrue(self.job == self.job2) 26 | 27 | def test_get_hash(self): 28 | self.assertEqual(self.job.get_hash(), self.job2.get_hash()) 29 | # self.assertEqual(self.submission, self.submission2) 30 | 31 | def test_serialize_deserialize(self): 32 | self.assertEqual(self.job, Job.deserialize(job_dict=self.job.serialize())) 33 | 34 | def test_static_serialize(self): 35 | self.assertNotIn( 36 | "job_state", list(self.job.serialize(if_static=True).values())[0] 37 | ) 38 | self.assertNotIn("job_id", list(self.job.serialize(if_static=True).values())[0]) 39 | self.assertNotIn( 40 | "fail_count", list(self.job.serialize(if_static=True).values())[0] 41 | ) 42 | 43 | def test_get_job_state(self): 44 | pass 45 | 46 | def test_handle_unexpected_job_state(self): 47 | pass 48 | 49 | def test_register_job_id(self): 50 | pass 51 | 52 | def test_submit_job(self): 53 | pass 54 | 55 | def test_job_to_json(self): 56 | pass 57 | 58 | 59 | # def test_content_serialize(self): 60 | # self.assertEqual(self.job.content_serialize(), self.job.serialize()[self.job.job_hash]) 61 | -------------------------------------------------------------------------------- /tests/test_class_machine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | 8 | from .context import ( 9 | PBS, 10 | Machine, 11 | setUpModule, # noqa: F401 12 | ) 13 | from .sample_class import SampleClass 14 | 15 | 16 | class TestMachineInit(unittest.TestCase): 17 | def setUp(self): 18 | self.maxDiff = None 19 | 20 | def test_machine_serialize_deserialize(self): 21 | pbs = SampleClass.get_sample_pbs_local_context() 22 | self.assertEqual(pbs, Machine.deserialize(pbs.serialize())) 23 | 24 | def 
test_machine_load_from_dict(self): 25 | pbs = SampleClass.get_sample_pbs_local_context() 26 | self.assertEqual(pbs, PBS.load_from_dict(pbs.serialize())) 27 | -------------------------------------------------------------------------------- /tests/test_class_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | __package__ = "tests" 8 | # from .context import LocalSession 9 | # from .context import LocalContext 10 | from .context import ( 11 | Resources, 12 | setUpModule, # noqa: F401 13 | ) 14 | from .sample_class import SampleClass 15 | 16 | 17 | class TestResources(unittest.TestCase): 18 | def setUp(self): 19 | self.maxDiff = None 20 | self.resources = SampleClass.get_sample_resources() 21 | self.resources_dict = SampleClass.get_sample_resources_dict() 22 | 23 | def test_eq(self): 24 | self.assertEqual(self.resources, SampleClass.get_sample_resources()) 25 | 26 | def test_serialize(self): 27 | self.assertEqual(self.resources.serialize(), self.resources_dict) 28 | 29 | def test_deserialize(self): 30 | resources = Resources.deserialize(resources_dict=self.resources_dict) 31 | self.assertEqual(self.resources, resources) 32 | 33 | def test_serialize_deserialize(self): 34 | self.assertEqual( 35 | self.resources, 36 | Resources.deserialize(resources_dict=self.resources.serialize()), 37 | ) 38 | 39 | def test_resources_json(self): 40 | with open("jsons/resources.json") as f: 41 | resources_json_dict = json.load(f) 42 | self.assertTrue(resources_json_dict, self.resources_dict) 43 | self.assertTrue(resources_json_dict, self.resources.serialize()) 44 | 45 | def test_arginfo(self): 46 | self.resources.arginfo() 47 | 48 | def test_load_from_json(self): 49 | resources = Resources.load_from_json("jsons/resources.json") 50 | self.assertTrue(resources, self.resources) 51 | -------------------------------------------------------------------------------- /tests/test_class_submission.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import unittest 5 | from unittest.mock import MagicMock, patch 6 | 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 8 | __package__ = "tests" 9 | from .context import ( 10 | JobStatus, 11 | Submission, 12 | setUpModule, # noqa: F401 13 | ) 14 | from .sample_class import SampleClass 15 | 16 | 17 | class TestSubmission(unittest.TestCase): 18 | def setUp(self): 19 | self.maxDiff = None 20 | pbs = SampleClass.get_sample_pbs_local_context() 21 | self.submission = SampleClass.get_sample_submission() 22 | self.submission.bind_machine(machine=pbs) 23 | 24 | # self.submission2 = Submission.submission_from_json('jsons/submission.json') 25 | # self.submission2 = Submission.submission_from_json('jsons/submission.json') 26 | 27 | def test_serialize_deserialize(self): 28 | self.assertEqual( 29 | self.submission.serialize(), 30 | Submission.deserialize( 31 | submission_dict=self.submission.serialize() 32 | ).serialize(), 33 | ) 34 | 35 | def test_get_hash(self): 36 | pass 37 | 38 | def test_bind_machine(self): 39 | self.assertIsNotNone(self.submission.machine.context.submission) 40 | for job in self.submission.belonging_jobs: 41 | self.assertIsNotNone(job.machine) 42 | 43 | def test_get_submision_state(self): 44 | pass 45 | 46 | def 
test_handle_unexpected_submission_state(self): 47 | pass 48 | 49 | def test_submit_submission(self): 50 | pass 51 | 52 | def test_upload_jobs(self): 53 | pass 54 | 55 | def test_download_jobs(self): 56 | pass 57 | 58 | def test_submission_to_json(self): 59 | pass 60 | 61 | @patch("dpdispatcher.Submission.submission_to_json") 62 | @patch("dpdispatcher.Submission.update_submission_state") 63 | def test_check_all_finished( 64 | self, patch_update_submission_state, patch_submission_to_json 65 | ): 66 | patch_update_submission_state = MagicMock(return_value=None) 67 | patch_submission_to_json = MagicMock(return_value=None) 68 | 69 | self.submission.belonging_jobs[0].job_state = JobStatus.running 70 | self.submission.belonging_jobs[1].job_state = JobStatus.waiting 71 | self.assertFalse(self.submission.check_all_finished()) 72 | 73 | self.submission.belonging_jobs[0].job_state = JobStatus.finished 74 | self.submission.belonging_jobs[1].job_state = JobStatus.unsubmitted 75 | self.assertFalse(self.submission.check_all_finished()) 76 | 77 | self.submission.belonging_jobs[0].job_state = JobStatus.completing 78 | self.submission.belonging_jobs[1].job_state = JobStatus.finished 79 | self.assertFalse(self.submission.check_all_finished()) 80 | 81 | self.submission.belonging_jobs[0].job_state = JobStatus.finished 82 | self.submission.belonging_jobs[1].job_state = JobStatus.unknown 83 | self.assertFalse(self.submission.check_all_finished()) 84 | 85 | self.submission.belonging_jobs[0].job_state = JobStatus.finished 86 | self.submission.belonging_jobs[1].job_state = JobStatus.finished 87 | self.assertTrue(self.submission.check_all_finished()) 88 | 89 | def test_submission_from_json(self): 90 | submission2 = Submission.submission_from_json("jsons/submission.json") 91 | # print('<<<<<<<', self.submission) 92 | # print('>>>>>>>', submission2) 93 | self.assertEqual(self.submission.serialize(), submission2.serialize()) 94 | 95 | def test_submission_json(self): 96 | with open("jsons/submission.json") as f: 97 | submission_json_dict = json.load(f) 98 | self.assertTrue(submission_json_dict, self.submission.serialize()) 99 | 100 | def test_try_recover_from_json(self): 101 | pass 102 | 103 | def test_repr(self): 104 | submission_repr = repr(self.submission) 105 | j = json.dumps(self.submission.serialize(), indent=4) 106 | self.assertEqual(submission_repr, j) 107 | # self.submission_to_json() 108 | 109 | def test_clean(self): 110 | pass 111 | -------------------------------------------------------------------------------- /tests/test_class_submission_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | from .context import setUpModule # noqa: F401 8 | from .sample_class import SampleClass 9 | 10 | # print('in', SampleClass.get_sample_empty_submission()) 11 | 12 | 13 | class TestSubmissionInit(unittest.TestCase): 14 | def setUp(self): 15 | self.maxDiff = None 16 | # self.empty_submission = SampleClass.get_sample_empty_submission() 17 | # print('TestSubmissionInit.setUp:self.empty_submission.belonging_tasks', self.empty_submission.belonging_tasks) 18 | 19 | def test_reigister_task(self): 20 | empty_submission = SampleClass.get_sample_empty_submission() 21 | task = SampleClass.get_sample_task() 22 | # print('TestSubmissionInit.test_reigister_task:self.empty_submission.belonging_tasks', empty_submission.belonging_tasks) 
23 | empty_submission.register_task(task=task) 24 | # print('7890809', SampleClass.get_sample_empty_submission().belonging_tasks) 25 | self.assertEqual([task], empty_submission.belonging_tasks) 26 | 27 | def test_reigister_task_whether_copy(self): 28 | empty_submission = SampleClass.get_sample_empty_submission() 29 | task = SampleClass.get_sample_task() 30 | empty_submission.register_task(task=task) 31 | empty_submission2 = SampleClass.get_sample_empty_submission() 32 | self.assertEqual(empty_submission2.belonging_tasks, []) 33 | 34 | # empty_submission = 35 | 36 | # def test_reigister_task_list(self): 37 | # pass 38 | 39 | 40 | # print('out', SampleClass.get_sample_empty_submission()) 41 | # print('TestSubmissionInit.test_register_task_list:task_list', task_list) 42 | # empty_submission = SampleClass.get_sample_empty_submission() 43 | # task_list = SampleClass.get_sample_task_list() 44 | # empty_submission.register_task_list(task_list=task_list) 45 | # self.empty_submission.register_task_list(task_list=task_list) 46 | # self.assertEqual(task_list, empty_submission.belonging_tasks) 47 | 48 | # def tesk_generate_jobs(self): 49 | # task_list = SampleClass.get_sample_task_list() 50 | # self.submission.register_task_list(task_list=task_list) 51 | # self.submission.generate_jobs() 52 | # task1, task2, task3, task4 = task_list 53 | # task_ll = [job.job_task_list for job in self.submission.belonging_jobs] 54 | # self.assertEqual([[task3, task2], [task4, task1]], task_ll) 55 | -------------------------------------------------------------------------------- /tests/test_class_task.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | __package__ = "tests" 8 | # from .context import LocalContext 9 | 10 | # from .context import Dispatcher 11 | from .context import ( 12 | Task, 13 | setUpModule, # noqa: F401 14 | ) 15 | from .sample_class import SampleClass 16 | 17 | 18 | class TestTask(unittest.TestCase): 19 | def setUp(self): 20 | self.task = SampleClass.get_sample_task() 21 | self.task_dict = SampleClass.get_sample_task_dict() 22 | 23 | def test_serialize(self): 24 | self.assertEqual(self.task.serialize(), self.task_dict) 25 | 26 | def test_deserialize(self): 27 | task = Task.deserialize(task_dict=self.task_dict) 28 | self.assertTrue(task, self.task) 29 | 30 | def test_serialize_deserialize(self): 31 | self.assertEqual(Task.deserialize(task_dict=self.task.serialize()), self.task) 32 | 33 | def test_task_json(self): 34 | with open("jsons/task.json") as f: 35 | task_json_dict = json.load(f) 36 | self.assertTrue(task_json_dict, self.task_dict) 37 | self.assertTrue(task_json_dict, self.task.serialize()) 38 | 39 | def test_repr(self): 40 | task_repr = repr(self.task) 41 | print("debug:", task_repr, self.task_dict) 42 | self.assertEqual(task_repr, str(self.task_dict)) 43 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess as sp 2 | import unittest 3 | 4 | 5 | class TestCLI(unittest.TestCase): 6 | def test_cli(self): 7 | sp.check_output(["dpdisp", "-h"]) 8 | for subcommand in ( 9 | "submission", 10 | "gui", 11 | "run", 12 | ): 13 | sp.check_output(["dpdisp", subcommand, "-h"]) 14 | -------------------------------------------------------------------------------- 
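
The serialize/deserialize round trips exercised by the class tests above can be reproduced directly on the JSON fixtures. A minimal sketch, run from the tests/ directory and using only calls shown in tests/test_class_task.py, tests/test_class_resources.py, and tests/test_group_size.py:

import json

from dpdispatcher import Resources, Task

with open("jsons/task.json") as f:
    task = Task.load_from_dict(json.load(f))
# Task defines equality, so a serialize/deserialize round trip can be asserted
assert Task.deserialize(task_dict=task.serialize()) == task

resources = Resources.load_from_json("jsons/resources.json")
assert Resources.deserialize(resources_dict=resources.serialize()) == resources
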
/tests/test_context_dir/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-1/some_dir/some_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/test_context_dir/0_md/bct-1/some_dir/some_file -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-3/conf.lmp: 
-------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/dir with space/file with space: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/test_context_dir/0_md/dir with space/file with space -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/graph.pb: 
-------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_context_dir/0_md/some_dir/some_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/test_context_dir/0_md/some_dir/some_file -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | """This module ensures input in the examples directory 2 | could pass the argument checking. 3 | """ 4 | 5 | import json 6 | import unittest 7 | from pathlib import Path 8 | from typing import Sequence, Tuple 9 | 10 | from dargs import Argument 11 | 12 | from dpdispatcher.arginfo import machine_dargs, resources_dargs, task_dargs 13 | 14 | # directory of examples 15 | p_examples = Path(__file__).parent.parent / "examples" 16 | 17 | machine_args = machine_dargs() 18 | resources_args = resources_dargs(detail_kwargs=False) 19 | task_args = task_dargs() 20 | 21 | # input_files : tuple[tuple[Argument, Path]] 22 | # tuple of example list 23 | input_files: Sequence[Tuple[Argument, Path]] = ( 24 | (machine_args, p_examples / "machine" / "expanse.json"), 25 | (machine_args, p_examples / "machine" / "lazy_local.json"), 26 | (machine_args, p_examples / "machine" / "mandu.json"), 27 | (machine_args, p_examples / "machine" / "ssh_proxy_command.json"), 28 | (resources_args, p_examples / "resources" / "expanse_cpu.json"), 29 | (resources_args, p_examples / "resources" / "mandu.json"), 30 | (resources_args, p_examples / "resources" / "tiger.json"), 31 | (task_args, p_examples / "task" / "deepmd-kit.json"), 32 | (task_args, p_examples / "task" / "g16.json"), 33 | ) 34 | 35 | 36 | class TestExamples(unittest.TestCase): 37 | def test_arguments(self): 38 | for arginfo, fn in input_files: 39 | fn = str(fn) 40 | with self.subTest(fn=fn): 41 | with open(fn) as f: 42 | data = json.load(f) 43 | arginfo.check_value(data, strict=True) 44 | -------------------------------------------------------------------------------- /tests/test_group_size.py: -------------------------------------------------------------------------------- 1 | """Test `Submission.generate_jobs` with different group size.""" 2 | 3 | import json 4 | import os 5 | import sys 6 | from pathlib import Path 7 | from unittest import TestCase 8 | 9 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 10 | __package__ = "tests" 11 | from .context import ( 12 | Machine, 13 | Resources, 14 | Submission, 15 | Task, 16 | setUpModule, # noqa: F401 17 | ) 18 | 19 | # 99 tasks in total 20 | # group_size - expected_ntasks 21 | group_ntasks_pairs = [ 22 | (1, 99), 23 | (3, 33), 24 | (10, 10), 25 | (100, 1), 26 | (0, 1), 27 | ] 28 | 29 | cwd = Path(__file__).parent 30 | with open(cwd / "jsons" / "machine.json") as f: 31 | j_machine = json.load(f)["machine"] 32 | with open(cwd / "jsons" / "resources.json") as f: 33 | j_resources = json.load(f) 34 | with open(cwd / "jsons" / "task.json") as f: 35 | j_task = json.load(f) 36 | 37 | 38 | class TestGroupSize(TestCase): 39 | def test_works_as_expected(self): 40 | for group_size, ntasks in group_ntasks_pairs: 41 | with self.subTest(group_size): 42 | machine = Machine.load_from_dict(j_machine) 43 | j_resources["group_size"] = group_size 44 | resources = 
Resources.load_from_dict(j_resources) 45 | tasks = [Task.load_from_dict(j_task) for _ in range(99)] 46 | submission = Submission(".", machine, resources, task_list=tasks) 47 | submission.generate_jobs() 48 | self.assertEqual(len(submission.belonging_jobs), ntasks) 49 | -------------------------------------------------------------------------------- /tests/test_gui.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: LGPL-3.0-or-later 2 | import unittest 3 | 4 | from dpgui import ( 5 | generate_dpgui_templates, 6 | ) 7 | 8 | 9 | class TestDPGUI(unittest.TestCase): 10 | def test_dpgui_entrypoints(self): 11 | self.assertTrue(len(generate_dpgui_templates()) > 0) 12 | -------------------------------------------------------------------------------- /tests/test_hdfs_context.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | import tarfile 6 | import unittest 7 | from glob import glob 8 | 9 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 10 | __package__ = "tests" 11 | 12 | from .context import ( 13 | HDFS, 14 | HDFSContext, 15 | Machine, 16 | setUpModule, # noqa: F401 17 | ) 18 | from .sample_class import SampleClass 19 | 20 | 21 | @unittest.skipIf(not shutil.which("hadoop"), "requires hadoop") 22 | class TestHDFSContext(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | with open("jsons/machine_yarn.json") as f: 26 | mdata = json.load(f) 27 | cls.machine = Machine.load_from_dict(mdata["machine"]) 28 | cls.submission = SampleClass.get_sample_submission() 29 | cls.submission.bind_machine(cls.machine) 30 | cls.submission_hash = cls.submission.submission_hash 31 | 32 | def setUp(self): 33 | self.context = self.__class__.machine.context 34 | 35 | def test_0_hdfs_context(self): 36 | self.assertIsInstance(self.context, HDFSContext) 37 | 38 | def test_1_upload(self): 39 | self.context.upload(self.__class__.submission) 40 | 41 | def test_2_fake_run(self): 42 | rfile_tgz = ( 43 | self.context.remote_root 44 | + "/" 45 | + self.context.submission.submission_hash 46 | + "_upload.tgz" 47 | ) 48 | tmp_dir = "./tmp_fake_run" 49 | if os.path.exists(tmp_dir): 50 | shutil.rmtree(tmp_dir) 51 | os.mkdir(tmp_dir) 52 | self.assertTrue(HDFS.copy_to_local(rfile_tgz, tmp_dir)) 53 | 54 | cwd = os.getcwd() 55 | os.chdir(tmp_dir) 56 | tgz_file_list = glob("*_upload.tgz") 57 | for tgz in tgz_file_list: 58 | with tarfile.open(tgz, "r:gz") as tar: 59 | tar.extractall() 60 | os.remove(tgz) 61 | 62 | file_list = [ 63 | "bct-1/log.lammps", 64 | "bct-2/log.lammps", 65 | "bct-3/log.lammps", 66 | "bct-4/log.lammps", 67 | ] 68 | for fname in file_list: 69 | with open(fname, "w") as fp: 70 | fp.write("# mock log") 71 | 72 | file_list = glob("./*") 73 | download_tgz = self.context.submission.submission_hash + "_1_download.tar.gz" 74 | with tarfile.open(download_tgz, "w:gz", dereference=True) as tar: 75 | for ii in file_list: 76 | tar.add(ii) 77 | ret, _ = HDFS.copy_from_local(download_tgz, self.context.remote_root) 78 | self.assertTrue(ret) 79 | os.chdir(cwd) 80 | shutil.rmtree(tmp_dir) 81 | 82 | def test_3_download(self): 83 | self.context.download(self.__class__.submission) 84 | file_list = [ 85 | "bct-1/log.lammps", 86 | "bct-2/log.lammps", 87 | "bct-3/log.lammps", 88 | "bct-4/log.lammps", 89 | ] 90 | for fname in file_list: 91 | self.assertTrue( 92 | os.path.isfile(os.path.join(self.context.local_root, fname)) 93 | 
) 94 | os.remove(os.path.join(self.context.local_root, fname)) 95 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-3/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 
0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_hdfs_dir/0_md/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_if_cuda_multi_devices/test_dir/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/test_if_cuda_multi_devices/test_dir/test.txt -------------------------------------------------------------------------------- /tests/test_import_classes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | 
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | from .context import ( 8 | dpdispatcher, 9 | setUpModule, # noqa: F401 10 | ) 11 | 12 | 13 | class TestImportClasses(unittest.TestCase): 14 | def setUp(self): 15 | self.maxDiff = None 16 | 17 | def test_import_class_Machine(self): 18 | from dpdispatcher import Machine 19 | 20 | self.assertEqual(dpdispatcher.machine.Machine, Machine) 21 | 22 | def test_import_class_Resources(self): 23 | from dpdispatcher import Resources 24 | 25 | self.assertEqual(dpdispatcher.submission.Resources, Resources) 26 | 27 | def test_import_class_Submission(self): 28 | from dpdispatcher import Submission 29 | 30 | self.assertEqual(dpdispatcher.submission.Submission, Submission) 31 | 32 | def test_import_class_Task(self): 33 | from dpdispatcher import Task 34 | 35 | self.assertEqual(dpdispatcher.submission.Task, Task) 36 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | 
thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-3/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type 
xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_jh_unischeduler/0_md/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_lazy_local_context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | from unittest.mock import MagicMock 6 | 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 8 | __package__ = "tests" 9 | from .context import ( 10 | LazyLocalContext, 11 | setUpModule, # noqa: F401 12 | ) 13 | 14 | 15 | class TestLazyLocalContext(unittest.TestCase): 16 | def setUp(self): 17 | # os.makedirs('loc', exist_ok = True) 18 | # os.makedirs('loc/task0', exist_ok = True) 19 | # os.makedirs('loc/task1', exist_ok = True) 20 | shutil.copytree(src="test_context_dir/", dst="tmp_lazy_local_context_dir/") 21 | 22 | self.lazy_local_context = LazyLocalContext( 23 | local_root="tmp_lazy_local_context_dir/" 24 | ) 25 | submission = MagicMock(work_base="0_md/") 26 | self.lazy_local_context.bind_submission(submission) 27 | 28 | def tearDown(self): 29 | shutil.rmtree("tmp_lazy_local_context_dir/") 30 | 31 | def test_upload(self): 32 | pass 33 | 34 | def test_download(self): 35 | pass 36 | 37 | # TODO: support other platforms 38 | @unittest.skipIf(sys.platform != "linux", "not linux") 39 | def test_block_call(self): 40 | code, stdin, stdout, stderr = self.lazy_local_context.block_call("ls") 41 | self.assertEqual( 42 | stdout.readlines(), 43 | [ 44 | "bct-1\n", 45 | "bct-2\n", 46 | "bct-3\n", 47 | "bct-4\n", 48 | "dir with space\n", 49 | "graph.pb\n", 50 | "some_dir\n", 51 | ], 52 | ) 53 | self.assertEqual(code, 0) 54 | 55 | code, stdin, stdout, stderr = self.lazy_local_context.block_call("ls a") 56 | self.assertEqual(code, 2) 57 | # self.assertEqual(stderr.read().decode('utf-8'), "ls: cannot access 'a': No such file or directory\n") 58 | err_msg = stderr.read().decode("utf-8") 59 | self.assertTrue("ls: cannot access" in err_msg) 60 | self.assertTrue("No such file or directory\n" in err_msg) 61 | 62 | # def test_block_checkcall(self) : 63 | # self.job = LazyLocalContext('loc', None) 64 | # tasks = ['task0', 'task1'] 65 | # files = ['test0', 'test1'] 66 | # self.job.upload(tasks, files) 67 | # # ls 68 | # stdin, stdout, stderr = self.job.block_checkcall('ls') 69 | # self.assertEqual(stdout.read().decode('utf-8'), 'task0\ntask1\n') 70 | # self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) 71 | # with self.assertRaises(RuntimeError): 72 | # stdin, stdout, stderr = self.job.block_checkcall('ls a') 73 | 74 | # def test_file(self) : 75 | # self.job = LazyLocalContext('loc', None) 76 | # self.assertFalse(self.job.check_file_exists('aaa')) 77 | # tmp = str(uuid.uuid4()) 78 | # self.job.write_file('aaa', tmp) 79 | # self.assertTrue(self.job.check_file_exists('aaa')) 80 | # tmp1 = self.job.read_file('aaa') 81 | # self.assertEqual(tmp, tmp1) 82 | 83 | # def test_call(self) : 84 | # self.job = LazyLocalContext('loc', None) 85 | # proc = self.job.call('sleep 3') 86 | # self.assertFalse(self.job.check_finish(proc)) 87 | # time.sleep(1) 88 | # self.assertFalse(self.job.check_finish(proc)) 89 | # time.sleep(2.5) 90 | # self.assertTrue(self.job.check_finish(proc)) 91 | # r,o,e=self.job.get_return(proc) 92 | # 
self.assertEqual(r, 0) 93 | # self.assertEqual(o.read(), b'') 94 | # self.assertEqual(e.read(), b'') 95 | # r,o,e=self.job.get_return(proc) 96 | # self.assertEqual(r, 0) 97 | # self.assertEqual(o, None) 98 | # self.assertEqual(e, None) 99 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-3/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 
3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_lsf_dir/0_md/submission.json: -------------------------------------------------------------------------------- 1 | { 2 | "work_base": "0_md", 3 | "resources": { 4 | "number_node": 1, 5 | "cpu_per_node": 4, 6 | "gpu_per_node": 1, 7 | "queue_name": "V100_8_32", 8 | "group_size": 4, 9 | "if_cuda_multi_devices": true 10 | }, 11 | "forward_common_files": [ 12 | "graph.pb" 
13 | ], 14 | "backward_common_files": [ 15 | "submission.json" 16 | ], 17 | "belonging_jobs": [ 18 | { 19 | "89936e3ac869b3132977da5cc4187725f3318ea3": { 20 | "job_task_list": [ 21 | { 22 | "command": "lmp_serial -i input.lammps", 23 | "task_work_path": "bct-3", 24 | "forward_files": [ 25 | "conf.lmp", 26 | "input.lammps" 27 | ], 28 | "backward_files": [ 29 | "log.lammps" 30 | ], 31 | "outlog": "log", 32 | "errlog": "err", 33 | "task_need_resources": 0.25 34 | }, 35 | { 36 | "command": "lmp_serial -i input.lammps", 37 | "task_work_path": "bct-2", 38 | "forward_files": [ 39 | "conf.lmp", 40 | "input.lammps" 41 | ], 42 | "backward_files": [ 43 | "log.lammps" 44 | ], 45 | "outlog": "log", 46 | "errlog": "err", 47 | "task_need_resources": 0.25 48 | }, 49 | { 50 | "command": "lmp_serial -i input.lammps", 51 | "task_work_path": "bct-4", 52 | "forward_files": [ 53 | "conf.lmp", 54 | "input.lammps" 55 | ], 56 | "backward_files": [ 57 | "log.lammps" 58 | ], 59 | "outlog": "log", 60 | "errlog": "err", 61 | "task_need_resources": 0.5 62 | }, 63 | { 64 | "command": "lmp_serial -i input.lammps", 65 | "task_work_path": "bct-1", 66 | "forward_files": [ 67 | "conf.lmp", 68 | "input.lammps" 69 | ], 70 | "backward_files": [ 71 | "log.lammps" 72 | ], 73 | "outlog": "log", 74 | "errlog": "err", 75 | "task_need_resources": 1 76 | } 77 | ], 78 | "resources": { 79 | "number_node": 1, 80 | "cpu_per_node": 4, 81 | "gpu_per_node": 1, 82 | "queue_name": "V100_8_32", 83 | "group_size": 4, 84 | "if_cuda_multi_devices": true 85 | }, 86 | "job_state": 5, 87 | "job_id": "21463.scheduler", 88 | "fail_count": 1 89 | } 90 | } 91 | ] 92 | } -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 
0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-3/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # 
box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_pbs_dir/0_md/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_retry.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | from .context import ( 8 | RetrySignal, 9 | retry, 10 | setUpModule, # noqa: F401 11 | ) 12 | 13 | 14 | class TestRetry(unittest.TestCase): 15 | def test_retry_fail(self): 16 | """Always retry.""" 17 | 18 | @retry(max_retry=3, sleep=0.05, catch_exception=RetrySignal) 19 | def some_method(): 20 | raise RetrySignal("Failed to do something") 21 | 22 | with self.assertRaises(RuntimeError): 23 | some_method() 24 | 25 | def test_retry_success(self): 26 | """Retry less than 3 times.""" 27 | retry_times = [0] 28 | 29 | @retry(max_retry=3, sleep=0.05, catch_exception=RetrySignal) 30 | def some_method(retry_times): 31 | if retry_times[0] < 2: 32 | retry_times[0] += 1 33 | raise RetrySignal("Failed to do something") 34 | 35 | some_method(retry_times) 36 | -------------------------------------------------------------------------------- /tests/test_rsync_flags.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | from unittest.mock import patch 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | __package__ = "tests" 8 | 9 | from dpdispatcher.utils.utils import rsync 10 | 11 | 12 | class TestRsyncFlags(unittest.TestCase): 13 | """Test rsync function flags to ensure correct options are used.""" 14 | 15 | @patch("dpdispatcher.utils.utils.run_cmd_with_all_output") 16 | def test_rsync_flags_exclude_owner_group(self, mock_run_cmd): 17 | """Test that rsync uses flags that exclude owner and group preservation.""" 18 | # Mock successful command execution 19 | mock_run_cmd.return_value = (0, "", "") 20 | 21 | # Call rsync function 22 | rsync("source_file", "dest_file", key_filename="test_key") 23 | 24 | # Verify the command was called 25 | mock_run_cmd.assert_called_once() 26 | 27 | # Get the command that was executed 28 | called_cmd = mock_run_cmd.call_args[0][0] 29 | 30 | # Verify the command contains the correct flags 31 | self.assertIn("-rlptDz", called_cmd) 32 | self.assertNotIn("-az", called_cmd) 33 | 34 | # Verify rsync command structure 35 | self.assertIn("rsync", called_cmd) 36 | self.assertIn("source_file", called_cmd) 37 | self.assertIn("dest_file", called_cmd) 38 | self.assertIn("-e", called_cmd) 39 | 
self.assertIn("-q", called_cmd) 40 | 41 | @patch("dpdispatcher.utils.utils.run_cmd_with_all_output") 42 | def test_rsync_with_proxy_command_flags(self, mock_run_cmd): 43 | """Test that rsync uses correct flags even with proxy command.""" 44 | # Mock successful command execution 45 | mock_run_cmd.return_value = (0, "", "") 46 | 47 | # Call rsync function with proxy command 48 | rsync( 49 | "source_file", 50 | "dest_file", 51 | key_filename="test_key", 52 | proxy_command="ssh -W target:22 jump_host", 53 | ) 54 | 55 | # Verify the command was called 56 | mock_run_cmd.assert_called_once() 57 | 58 | # Get the command that was executed 59 | called_cmd = mock_run_cmd.call_args[0][0] 60 | 61 | # Verify the command contains the correct flags 62 | self.assertIn("-rlptDz", called_cmd) 63 | self.assertNotIn("-az", called_cmd) 64 | 65 | @patch("dpdispatcher.utils.utils.run_cmd_with_all_output") 66 | def test_rsync_error_handling(self, mock_run_cmd): 67 | """Test that rsync properly handles errors.""" 68 | # Mock failed command execution 69 | mock_run_cmd.return_value = ( 70 | 23, 71 | "", 72 | "rsync: chown failed: Operation not permitted", 73 | ) 74 | 75 | # Call rsync function and expect RuntimeError 76 | with self.assertRaises(RuntimeError) as context: 77 | rsync("source_file", "dest_file") 78 | 79 | # Verify error message contains the command and error 80 | self.assertIn("Failed to run", str(context.exception)) 81 | self.assertIn( 82 | "rsync: chown failed: Operation not permitted", str(context.exception) 83 | ) 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /tests/test_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import unittest 5 | from pathlib import Path 6 | 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 8 | __package__ = "tests" 9 | 10 | from .context import run 11 | 12 | 13 | class TestRun(unittest.TestCase): 14 | def test_run(self): 15 | this_dir = Path(__file__).parent 16 | cwd = os.getcwd() 17 | with tempfile.TemporaryDirectory() as temp_dir: 18 | try: 19 | os.chdir(temp_dir) 20 | run(filename=str(this_dir / "hello_world.py")) 21 | self.assertEqual( 22 | (Path(temp_dir) / "log").read_text().strip(), "hello world!" 
23 | ) 24 | finally: 25 | os.chdir(cwd) 26 | -------------------------------------------------------------------------------- /tests/test_run_submission_bohrium.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import textwrap 4 | import unittest 5 | from pathlib import Path 6 | 7 | sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) 8 | 9 | from test_run_submission import RunSubmission 10 | 11 | 12 | @unittest.skipIf( 13 | os.environ.get("DPDISPATCHER_TEST") != "bohrium", 14 | "outside the Bohrium testing environment", 15 | ) 16 | class TestBohriumRun(RunSubmission, unittest.TestCase): 17 | def setUp(self): 18 | super().setUp() 19 | self.machine_dict.update( 20 | batch_type="Bohrium", 21 | context_type="Bohrium", 22 | remote_profile={ 23 | "email": os.environ["BOHRIUM_EMAIL"], 24 | "password": os.environ["BOHRIUM_PASSWORD"], 25 | "project_id": int(os.environ["BOHRIUM_PROJECT_ID"]), 26 | "input_data": { 27 | "job_type": "indicate", 28 | "log_file": "log", 29 | "job_name": "dpdispather_test", 30 | "disk_size": 20, 31 | "scass_type": "c2_m4_cpu", 32 | "platform": "ali", 33 | "image_name": "registry.dp.tech/dptech/ubuntu:22.04-py3.10", 34 | "on_demand": 0, 35 | }, 36 | }, 37 | ) 38 | 39 | @unittest.skip("Manaually skip") # comment this line to open unittest 40 | def test_async_run_submission(self): 41 | return super().test_async_run_submission() 42 | 43 | 44 | @unittest.skipIf( 45 | os.environ.get("DPDISPATCHER_TEST") != "bohrium", 46 | "outside the Bohrium testing environment", 47 | ) 48 | class TestOpenAPIRun(RunSubmission, unittest.TestCase): 49 | def setUp(self): 50 | super().setUp() 51 | bohrium_config = textwrap.dedent( 52 | """\ 53 | [Credentials] 54 | accessKey={accesskey} 55 | """ 56 | ).format(accesskey=os.environ["BOHRIUM_ACCESS_KEY"]) 57 | Path.home().joinpath(".brmconfig").write_text(bohrium_config) 58 | self.machine_dict.update( 59 | batch_type="OpenAPI", 60 | context_type="OpenAPI", 61 | remote_profile={ 62 | "project_id": int(os.environ["BOHRIUM_PROJECT_ID"]), 63 | "machine_type": "c2_m4_cpu", 64 | "platform": "ali", 65 | "image_address": "registry.dp.tech/dptech/ubuntu:22.04-py3.10", 66 | "job_name": "dpdispather_test", 67 | }, 68 | ) 69 | 70 | @unittest.skip("Manaually skip") # comment this line to open unittest 71 | def test_async_run_submission(self): 72 | return super().test_async_run_submission() 73 | -------------------------------------------------------------------------------- /tests/test_shell_cuda_multi_devices.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 7 | __package__ = "tests" 8 | import unittest 9 | 10 | from .context import ( 11 | Machine, 12 | Resources, 13 | Submission, 14 | Task, 15 | get_file_md5, 16 | setUpModule, # noqa: F401 17 | ) 18 | 19 | 20 | @unittest.skipIf(sys.platform == "win32", "Shell is not supported on Windows") 21 | class TestShellCudaMultiDevices(unittest.TestCase): 22 | def setUp(self): 23 | self.maxDiff = None 24 | 25 | def test_shell_cuda_multi_devices(self): 26 | with open("jsons/machine_if_cuda_multi_devices.json") as f: 27 | machine_dict = json.load(f) 28 | machine = Machine.load_from_dict(machine_dict["machine"]) 29 | resources = Resources.load_from_dict(machine_dict["resources"]) 30 | 31 | task_list = [] 32 | for ii in range(16): 33 | task = Task( 34 | 
command=f"echo dpdispatcher_unittest_{ii}", 35 | task_work_path="./", 36 | forward_files=[], 37 | backward_files=[], 38 | outlog="out.txt", 39 | ) 40 | task_list.append(task) 41 | 42 | submission = Submission( 43 | work_base="test_dir/", 44 | machine=machine, 45 | resources=resources, 46 | forward_common_files=["test.txt"], 47 | backward_common_files=["out.txt"], 48 | task_list=task_list, 49 | ) 50 | submission.run_submission(clean=False) 51 | 52 | for ii in ["test.txt"]: 53 | f1 = os.path.join("test_if_cuda_multi_devices/", "test_dir/", ii) 54 | f2 = os.path.join( 55 | "tmp_if_cuda_multi_devices/", submission.submission_hash, ii 56 | ) 57 | self.assertEqual(get_file_md5(f1), get_file_md5(f2)) 58 | 59 | self.assertTrue(os.path.isfile("test_if_cuda_multi_devices/test_dir/out.txt")) 60 | 61 | @classmethod 62 | def tearDownClass(cls): 63 | shutil.rmtree("tmp_if_cuda_multi_devices/") 64 | # pass 65 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/fail_dir/mock_fail_task.txt: -------------------------------------------------------------------------------- 1 | # mock file for unittest; test when dpdispatcher meets fail task 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/dir with space/example.txt: -------------------------------------------------------------------------------- 1 | dir with space 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/dir1/example.txt: -------------------------------------------------------------------------------- 1 | # example1.txt 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/dir2/example.txt: -------------------------------------------------------------------------------- 1 | # example2.txt 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/dir3/example.txt: -------------------------------------------------------------------------------- 1 | # example3.txt 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/dir4/example.txt: -------------------------------------------------------------------------------- 1 | # example4.txt 2 | -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/parent_dir/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_shell_trival_dir/recover_dir/mock_recover_task.txt: -------------------------------------------------------------------------------- 1 | # mock file for unittest; test when dpdispatcher need recover tasks 2 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-1/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | 
-------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-1/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-2/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-2/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-3/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-3/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | 
compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-4/conf.lmp: -------------------------------------------------------------------------------- 1 | 2 | 2 atoms 3 | 1 atom types 4 | 0.0000000000 4.0000000000 xlo xhi 5 | 0.0000000000 4.0000000000 ylo yhi 6 | 0.0000000000 3.3800000000 zlo zhi 7 | 0.0000000000 0.0000000000 0.0000000000 xy xz yz 8 | 9 | Atoms # atomic 10 | 11 | 1 1 0.0000000000 0.0000000000 0.0000000000 12 | 2 1 2.0000000000 2.0000000000 1.6900000000 13 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/bct-4/input.lammps: -------------------------------------------------------------------------------- 1 | clear 2 | units metal 3 | dimension 3 4 | boundary p p p 5 | atom_style atomic 6 | # box tilt large 7 | read_data conf.lmp 8 | mass 1 118.71 9 | neigh_modify every 1 delay 0 check no 10 | pair_style deepmd ../graph.pb 11 | pair_coeff 12 | compute mype all pe 13 | compute mymsd all msd 14 | 15 | thermo 20 16 | thermo_style custom step temp pe pxx pyy pzz pxy pxz pyz lx ly lz vol c_mymsd[*] spcpu 17 | min_style cg 18 | fix 1 all box/relax iso 0.0 19 | minimize 1.000000e-12 1.000000e-06 5000 500000 20 | 21 | # timestep 0.002 22 | # velocity all create 2.0 7369221 23 | 24 | # fix 2 all npt temp 2.0 200.0 $(100.0*dt) aniso 0.0 200000.0 $(1000.0*dt) 25 | # run 2000 26 | # unfix 2 27 | 28 | dump 1 all custom 1 final.dump.relax id type xs ys zs fx fy fz 29 | run 10000 30 | 31 | write_data out.lmp 32 | -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/d3c842c5b9476e48f7145b370cd330372b9293e1.json: -------------------------------------------------------------------------------- 1 | { 2 | "work_base": "0_md", 3 | "resources": { 4 | "resources": { 5 | "number_node": 1, 6 | "cpu_per_node": 4, 7 | "gpu_per_node": 0, 8 | "queue_name": "debug", 9 | "group_size": 4, 10 | "if_cuda_multi_devices": false 11 | }, 12 | "slurm_sbatch_dict": { 13 | "mem": "10G", 14 | "cpus_per_task": 1, 15 | "time": "120:0:0" 16 | } 17 | }, 18 | "forward_common_files": [ 19 | "graph.pb" 20 | ], 21 | "backward_common_files": [ 22 | "*.json" 23 | ], 24 | "belonging_jobs": [ 25 | { 26 | "8cda6723de155874106d96a543e1872c9fc9aa1d": { 27 | "job_task_list": [ 28 | { 29 | "command": "/home/dp/deepmd-kit/bin/lmp -i input.lammps", 30 | "task_work_path": "bct-3", 31 | "forward_files": [ 32 | "conf.lmp", 33 | "input.lammps" 34 | ], 35 | "backward_files": [ 36 | "log.lammps" 37 | ], 38 | "outlog": "log", 39 | "errlog": "err", 40 | "task_need_resources": 0.25 41 | }, 42 | { 43 | "command": "/home/dp/deepmd-kit/bin/lmp -i input.lammps", 44 | "task_work_path": "bct-2", 45 | "forward_files": [ 46 | "conf.lmp", 47 | "input.lammps" 48 | ], 49 | "backward_files": [ 50 | "log.lammps" 51 | ], 52 | "outlog": "log", 53 | "errlog": "err", 54 | "task_need_resources": 0.25 55 | }, 56 | { 57 | "command": 
"/home/dp/deepmd-kit/bin/lmp -i input.lammps", 58 | "task_work_path": "bct-4", 59 | "forward_files": [ 60 | "conf.lmp", 61 | "input.lammps" 62 | ], 63 | "backward_files": [ 64 | "log.lammps" 65 | ], 66 | "outlog": "log", 67 | "errlog": "err", 68 | "task_need_resources": 0.5 69 | }, 70 | { 71 | "command": "/home/dp/deepmd-kit/bin/lmp -i input.lammps", 72 | "task_work_path": "bct-1", 73 | "forward_files": [ 74 | "conf.lmp", 75 | "input.lammps" 76 | ], 77 | "backward_files": [ 78 | "log.lammps" 79 | ], 80 | "outlog": "log", 81 | "errlog": "err", 82 | "task_need_resources": 1 83 | } 84 | ], 85 | "resources": { 86 | "resources": { 87 | "number_node": 1, 88 | "cpu_per_node": 4, 89 | "gpu_per_node": 0, 90 | "queue_name": "debug", 91 | "group_size": 4, 92 | "if_cuda_multi_devices": false 93 | }, 94 | "slurm_sbatch_dict": { 95 | "mem": "10G", 96 | "cpus_per_task": 1, 97 | "time": "120:0:0" 98 | } 99 | }, 100 | "job_state": 5, 101 | "job_id": "20", 102 | "fail_count": 1 103 | } 104 | } 105 | ] 106 | } -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/graph.pb: -------------------------------------------------------------------------------- 1 | ../../graph.pb -------------------------------------------------------------------------------- /tests/test_slurm_dir/0_md/submission.json: -------------------------------------------------------------------------------- 1 | { 2 | "work_base": "0_md", 3 | "resources": { 4 | "number_node": 1, 5 | "cpu_per_node": 4, 6 | "gpu_per_node": 1, 7 | "queue_name": "V100_8_32", 8 | "group_size": 4, 9 | "if_cuda_multi_devices": true 10 | }, 11 | "forward_common_files": [ 12 | "graph.pb" 13 | ], 14 | "backward_common_files": [ 15 | "submission.json" 16 | ], 17 | "belonging_jobs": [ 18 | { 19 | "89936e3ac869b3132977da5cc4187725f3318ea3": { 20 | "job_task_list": [ 21 | { 22 | "command": "lmp_serial -i input.lammps", 23 | "task_work_path": "bct-3", 24 | "forward_files": [ 25 | "conf.lmp", 26 | "input.lammps" 27 | ], 28 | "backward_files": [ 29 | "log.lammps" 30 | ], 31 | "outlog": "log", 32 | "errlog": "err", 33 | "task_need_resources": 0.25 34 | }, 35 | { 36 | "command": "lmp_serial -i input.lammps", 37 | "task_work_path": "bct-2", 38 | "forward_files": [ 39 | "conf.lmp", 40 | "input.lammps" 41 | ], 42 | "backward_files": [ 43 | "log.lammps" 44 | ], 45 | "outlog": "log", 46 | "errlog": "err", 47 | "task_need_resources": 0.25 48 | }, 49 | { 50 | "command": "lmp_serial -i input.lammps", 51 | "task_work_path": "bct-4", 52 | "forward_files": [ 53 | "conf.lmp", 54 | "input.lammps" 55 | ], 56 | "backward_files": [ 57 | "log.lammps" 58 | ], 59 | "outlog": "log", 60 | "errlog": "err", 61 | "task_need_resources": 0.5 62 | }, 63 | { 64 | "command": "lmp_serial -i input.lammps", 65 | "task_work_path": "bct-1", 66 | "forward_files": [ 67 | "conf.lmp", 68 | "input.lammps" 69 | ], 70 | "backward_files": [ 71 | "log.lammps" 72 | ], 73 | "outlog": "log", 74 | "errlog": "err", 75 | "task_need_resources": 1 76 | } 77 | ], 78 | "resources": { 79 | "number_node": 1, 80 | "cpu_per_node": 4, 81 | "gpu_per_node": 1, 82 | "queue_name": "V100_8_32", 83 | "group_size": 4, 84 | "if_cuda_multi_devices": true 85 | }, 86 | "job_state": 5, 87 | "job_id": "21463.scheduler", 88 | "fail_count": 1 89 | } 90 | } 91 | ] 92 | } -------------------------------------------------------------------------------- /tests/test_ssh_jump_host.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import 
unittest 4 | 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 6 | __package__ = "tests" 7 | from .context import ( 8 | SSHSession, 9 | setUpModule, # noqa: F401 10 | ) 11 | 12 | 13 | @unittest.skipIf( 14 | os.environ.get("DPDISPATCHER_TEST") != "ssh", "outside the ssh testing environment" 15 | ) 16 | class TestSSHJumpHost(unittest.TestCase): 17 | """Test SSH jump host functionality.""" 18 | 19 | def test_proxy_command_connection(self): 20 | """Test SSH connection using proxy_command via jump host.""" 21 | # Test connection from test -> server via jumphost 22 | ssh_session = SSHSession( 23 | hostname="server", 24 | username="root", 25 | key_filename="/root/.ssh/id_rsa", 26 | proxy_command="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i /root/.ssh/id_rsa -W server:22 root@jumphost", 27 | ) 28 | 29 | # Verify the connection was established 30 | self.assertIsNotNone(ssh_session.ssh) 31 | self.assertTrue(ssh_session._check_alive()) 32 | 33 | # Test running a simple command through the proxy 34 | assert ssh_session.ssh is not None # for type checker 35 | stdin, stdout, stderr = ssh_session.ssh.exec_command("echo 'test via proxy'") 36 | output = stdout.read().decode().strip() 37 | self.assertEqual(output, "test via proxy") 38 | 39 | # Verify proxy_command attribute is set correctly 40 | self.assertEqual( 41 | ssh_session.proxy_command, 42 | "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i /root/.ssh/id_rsa -W server:22 root@jumphost", 43 | ) 44 | 45 | ssh_session.close() 46 | 47 | def test_direct_connection_no_proxy(self): 48 | """Test direct SSH connection without proxy command.""" 49 | # Test direct connection from test -> server (no proxy) 50 | ssh_session = SSHSession( 51 | hostname="server", username="root", key_filename="/root/.ssh/id_rsa" 52 | ) 53 | 54 | # Verify the connection was established 55 | self.assertIsNotNone(ssh_session.ssh) 56 | self.assertTrue(ssh_session._check_alive()) 57 | 58 | # Test running a simple command 59 | assert ssh_session.ssh is not None # for type checker 60 | stdin, stdout, stderr = ssh_session.ssh.exec_command("echo 'test direct'") 61 | output = stdout.read().decode().strip() 62 | self.assertEqual(output, "test direct") 63 | 64 | # Verify no proxy_command is set 65 | self.assertIsNone(ssh_session.proxy_command) 66 | 67 | ssh_session.close() 68 | 69 | def test_jump_host_direct_connection(self): 70 | """Test direct connection to jump host itself.""" 71 | # Test direct connection from test -> jumphost 72 | ssh_session = SSHSession( 73 | hostname="jumphost", username="root", key_filename="/root/.ssh/id_rsa" 74 | ) 75 | 76 | # Verify the connection was established 77 | self.assertIsNotNone(ssh_session.ssh) 78 | self.assertTrue(ssh_session._check_alive()) 79 | 80 | # Test running a command on jumphost 81 | assert ssh_session.ssh is not None # for type checker 82 | stdin, stdout, stderr = ssh_session.ssh.exec_command("hostname") 83 | output = stdout.read().decode().strip() 84 | self.assertEqual(output, "jumphost") 85 | 86 | ssh_session.close() 87 | 88 | 89 | if __name__ == "__main__": 90 | unittest.main() 91 | -------------------------------------------------------------------------------- /tests/test_work_path/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepmodeling/dpdispatcher/4816095c9e711259877fb90023ce74ce527ba5c3/tests/test_work_path/.gitkeep 
--------------------------------------------------------------------------------
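tests/test_ssh_jump_host.py above exercises SSHSession directly, but in normal use the same proxy_command reaches dpdispatcher through a machine definition with an SSH context (compare the ssh_proxy_command.json machine example shipped in the repository's examples directory). A minimal sketch of that user-facing form, with placeholder host names, user, key path, and roots, might look like:

from dpdispatcher import Machine

# A sketch only: the hosts, user name, key path, and roots are placeholders,
# and the remote_profile keys are assumed to mirror the SSHSession arguments
# exercised in tests/test_ssh_jump_host.py.
machine = Machine.load_from_dict(
    {
        "batch_type": "Slurm",
        "context_type": "SSHContext",
        "local_root": "./",
        "remote_root": "/home/user/dpdispatcher_work_dir/",
        "remote_profile": {
            "hostname": "server",
            "username": "user",
            "key_filename": "/home/user/.ssh/id_rsa",
            # The same kind of ProxyCommand string the test passes to
            # SSHSession: hop through the jump host to reach the cluster.
            "proxy_command": "ssh -W server:22 user@jumphost",
        },
    }
)

Submissions built on such a machine then route their SSH traffic through the jump host, which is the behaviour test_proxy_command_connection checks at the session level.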