├── agentd
│   ├── __init__.py
│   ├── chromium.py
│   ├── logging_config.py
│   ├── firefox.py
│   ├── models.py
│   ├── celery_worker.py
│   ├── util.py
│   └── server.py
├── meta
│   ├── vendor-data
│   └── meta-data
├── tests
│   ├── __init__.py
│   └── test_server.py
├── .prettierignore
├── .python-version
├── .dockerignore
├── root_meta
│   ├── meta-data
│   └── user-data
├── scripts
│   ├── lint.py
│   └── build_docs.py
├── theme
│   ├── enable-compositing.desktop
│   ├── xfce4-desktop.xml
│   ├── xsettings.xml
│   ├── xfce4-panel.xml
│   └── xfwm4.xml
├── .flake8
├── conf
│   ├── xvfb.service
│   ├── websockify.service
│   ├── dconf.service
│   ├── gnome.service
│   ├── openbox.service
│   ├── lxqt.service
│   ├── agentd.service
│   ├── x11vnc.service
│   └── kasm
│       └── run
├── uvicorn_run
├── redis_run
├── user-data.tpl
├── docs
│   ├── index.rst
│   ├── Makefile
│   ├── browser.rst
│   ├── screenshots.rst
│   ├── make.bat
│   ├── conf.py
│   ├── keyboard.rst
│   ├── info.rst
│   ├── mouse.rst
│   └── recordings.rst
├── logging_config.yaml
├── xconf_run
├── .github
│   └── workflows
│       ├── poetry-lint.yml
│       ├── poetry-tests.yml
│       ├── poetry-docs.yml
│       └── docker-image.yml
├── remote_install_server.sh
├── install_deps.sh
├── install_desktop.sh
├── remote_install.sh
├── LICENSE
├── pyproject.toml
├── xfce4-desktop.xml
├── pack.sh
├── pack_server.sh
├── cloudbuild_old.yaml
├── CONTRIBUTING.md
├── Makefile
├── CODE_OF_CONDUCT.md
├── cloudbuild.yaml
├── .gitignore
├── install.sh
├── base.pkr.hcl
├── server.pkr.hcl
├── README.md
├── Dockerfile
└── Dockerfile.loaded
/agentd/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /meta/vendor-data: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | *.md 2 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12.4 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .github 3 | docs/ 4 | .recordings -------------------------------------------------------------------------------- /meta/meta-data: -------------------------------------------------------------------------------- 1 | instance-id: agentd 2 | local-hostname: agentd -------------------------------------------------------------------------------- /root_meta/meta-data: -------------------------------------------------------------------------------- 1 | instance-id: agentd 2 | local-hostname: agentd -------------------------------------------------------------------------------- /scripts/lint.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def main(): 5 | subprocess.run(["black", "."]) 6 | subprocess.run(["flake8", "."]) 7 | -------------------------------------------------------------------------------- /theme/enable-compositing.desktop: -------------------------------------------------------------------------------- 1 | [Desktop Entry] 2 | Type=Application 3 | Name=Enable xfwm4 Compositing
4 | Exec=xfconf-query -c xfwm4 -p /general/use_compositing -s true 5 | Terminal=false 6 | NoDisplay=true -------------------------------------------------------------------------------- /root_meta/user-data: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | password: ubuntu 3 | ssh_pwauth: true 4 | chpasswd: 5 | expire: false 6 | # users: 7 | # - name: agentsea 8 | # sudo: ALL=(ALL) NOPASSWD:ALL 9 | # groups: sudo 10 | # shell: /bin/bash 11 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203, E266, E501, W503 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | exclude = 7 | .git, 8 | __pycache__, 9 | build, 10 | dist, 11 | .venv, 12 | .tox, 13 | .mypy_cache, 14 | .pytest_cache, 15 | .vscode, 16 | -------------------------------------------------------------------------------- /conf/xvfb.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=X Virtual Frame Buffer Service 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=/usr/bin/Xvfb :99 -screen 0 1280x1024x24 7 | Environment="XAUTHORITY=/home/agentsea/.Xauthority" "DISPLAY=:99" 8 | User=agentsea 9 | Restart=on-failure 10 | RestartSec=2 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /conf/websockify.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Websockify Service 3 | After=x11vnc.service network.target xvfb.service 4 | 5 | [Service] 6 | ExecStart=/usr/bin/websockify 6080 localhost:5900 7 | Restart=on-failure 8 | User=agentsea 9 | RestartSec=11s 10 | StartLimitBurst=5 11 | StartLimitIntervalSec=60s 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /conf/dconf.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Apply dconf settings for GNOME 3 | Requires=gnome.service 4 | After=gnome.service 5 | 6 | [Service] 7 | Type=oneshot 8 | User=agentsea 9 | Environment="DISPLAY=:99" 10 | ExecStart=/bin/su agentsea -c "dconf write /org/gnome/initial-setup/done true" 11 | 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /conf/gnome.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=GNOME session on Xvfb 3 | Requires=xvfb.service 4 | After=xvfb.service 5 | PartOf=xvfb.service 6 | 7 | [Service] 8 | Type=forking 9 | User=agentsea 10 | Environment="DISPLAY=:99" "XAUTHORITY=/home/agentsea/.Xauthority" 11 | ExecStart=/usr/bin/dbus-launch gnome-session 12 | ExecStop=/usr/bin/killall gnome-session 13 | 14 | [Install] 15 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /conf/openbox.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Openbox Window Manager 3 | Requires=xvfb.service 4 | After=xvfb.service network.target 5 | 6 | [Service] 7 | Type=simple 8 | User=agentsea 9 | Environment="DISPLAY=:99" "XAUTHORITY=/home/agentsea/.Xauthority" 10 | ExecStart=/usr/bin/openbox --config-file /home/agentsea/.config/openbox/rc.xml 11 | Restart=on-failure 12 | RestartSec=5 13 | 14 | [Install] 15 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /uvicorn_run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | echo "Test log message from uvicorn_run script" >&2 5 | 6 | cd /config/app 7 | 8 | chown -R abc:abc /config/app 9 | 10 | source ./pyenv_setup.sh 11 | source ./venv/bin/activate 12 | 13 | export DISPLAY=:1 14 | 15 | exec s6-setuidgid abc uvicorn agentd.server:app \ 16 | --host 0.0.0.0 --port 8000 --log-level debug \ 17 | --log-config /config/app/logging_config.yaml -------------------------------------------------------------------------------- /conf/lxqt.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Start LXQt on Xvfb 3 | Requires=xvfb.service 4 | After=xvfb.service network.target 5 | 6 | [Service] 7 | Type=simple 8 | User=agentsea 9 | Environment="DISPLAY=:99" "XAUTHORITY=/home/agentsea/.Xauthority" 10 | Environment="DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1001/bus" 11 | ExecStart=/usr/bin/startlxqt 12 | Restart=on-failure 13 | RestartSec=5 14 | 15 | [Install] 16 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /conf/agentd.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=An agent daemon that gives AI agents desktop access 3 | After=network.target xvfb.service 4 | 5 | [Service] 6 | User=agentsea 7 | Environment="DISPLAY=:99" "XAUTHORITY=/home/agentsea/.Xauthority" 8 | WorkingDirectory=/home/agentsea/agentd 9 | ExecStart=/home/agentsea/.local/bin/uvicorn agentd.server:app --host 0.0.0.0 --port 8000 --reload 10 | Restart=always 11 | 12 | [Install] 13 | WantedBy=graphical.target -------------------------------------------------------------------------------- /redis_run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | echo "Test log message from redis_run script" >&2 4 | 5 | 6 | # Log the environment variables 7 | env > /config/app/logs/redis_env.log 8 | 9 | # exec a shell to run Redis, piping its output through sed to add a "[redis]" prefix 10 | exec sh -c "redis-server \ 11 | --bind 0.0.0.0 \ 12 | --maxmemory ${MAXMEMORY:-512mb} \ 13 | --maxmemory-policy allkeys-lru \ 14 | --loglevel ${LOG_LEVEL:-notice} \ 15 | 2>&1 | sed 's/^/[redis] /'" -------------------------------------------------------------------------------- /user-data.tpl: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | chpasswd: 3 | list: | 4 | agentsea:sailor 5 | expire: False 6 | users: 7 | - name: agentsea 8 | ssh_authorized_keys: 9 | - {{ ssh_public_key }} 10 | sudo: ALL=(ALL) NOPASSWD:ALL 11 | groups: sudo 12 | shell: /bin/bash 13 | runcmd: 14 | - growpart /dev/sda 1 15 | - resize2fs /dev/sda1 16 | - "curl -sSL https://raw.githubusercontent.com/agentsea/agentd/main/remote_install.sh | sudo bash" 17 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | AgentD API Reference 2 | ==================== 3 | 4 | The ``AgentD`` daemon exposes a set of HTTP endpoints for interacting with the VM.
5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: API Reference 9 | 10 | info 11 | mouse 12 | keyboard 13 | browser 14 | screenshots 15 | recordings 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | :caption: ↪ 20 | 21 | Go to User Guide 22 | -------------------------------------------------------------------------------- /scripts/build_docs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | 5 | 6 | def main(): 7 | # Define the build directory (this is the default for Sphinx) 8 | build_dir = "docs/_build/html" 9 | 10 | # Check if the build directory exists and remove it 11 | if os.path.exists(build_dir): 12 | shutil.rmtree(build_dir) 13 | 14 | # Now, run the Sphinx build command 15 | subprocess.run(["sphinx-build", "-b", "html", "docs/", build_dir]) 16 | 17 | 18 | # This allows the script to be run from the command line 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /conf/x11vnc.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=x11vnc service 3 | After=display-manager.service network.target syslog.target xvfb.service 4 | 5 | [Service] 6 | Type=simple 7 | User=agentsea 8 | Environment="XAUTHORITY=/home/agentsea/.Xauthority" "DISPLAY=:99" 9 | ExecStartPre=/bin/sleep 10 10 | ExecStart=/usr/bin/x11vnc -forever -display :99 -auth /home/agentsea/.Xauthority -passwd agentsea123 -shared -verbose -rfbport 5900 11 | ExecStop=/usr/bin/killall x11vnc 12 | Restart=on-failure 13 | RestartSec=11s 14 | StartLimitBurst=5 15 | StartLimitIntervalSec=60s 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /logging_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | 4 | formatters: 5 | custom: 6 | format: "[uvicorn] %(asctime)s %(levelname)s %(message)s" 7 | datefmt: "%Y-%m-%d %H:%M:%S" 8 | 9 | handlers: 10 | console: 11 | class: logging.StreamHandler 12 | formatter: custom 13 | stream: ext://sys.stdout 14 | 15 | loggers: 16 | uvicorn: 17 | level: DEBUG 18 | handlers: [console] 19 | propagate: no 20 | uvicorn.error: 21 | level: DEBUG 22 | handlers: [console] 23 | propagate: no 24 | uvicorn.access: 25 | level: DEBUG 26 | handlers: [console] 27 | propagate: no -------------------------------------------------------------------------------- /xconf_run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Debugging: show commands as they run 4 | set -x 5 | 6 | # 1) Make sure HOME points to the user’s config directory 7 | export HOME=/config 8 | export USER=abc 9 | 10 | # 2) Set DISPLAY and possibly XAUTHORITY 11 | export DISPLAY=:0 12 | export XAUTHORITY=/config/.Xauthority 13 | 14 | # Wait for xfwm4 (and the X server) to finish starting 15 | sleep 10 16 | 17 | echo "Setting compositing to true" 18 | 19 | # 3) Run xfconf-query as user "abc" 20 | exec s6-setuidgid abc xfconf-query -c xfwm4 -p /general/use_compositing -s true 21 | # exec s6-setuidgid abc xfwm4 --replace & 22 | -------------------------------------------------------------------------------- /.github/workflows/poetry-lint.yml: --------------------------------------------------------------------------------
1 | name: Poetry Lint 2 | 3 | on: 4 | push: 5 | branches: [ '**' ] 6 | pull_request: 7 | branches: [ '**' ] 8 | 9 | jobs: 10 | lint: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.10' 21 | 22 | - name: Install Poetry 23 | uses: snok/install-poetry@v1 24 | 25 | - name: Install dependencies 26 | run: | 27 | poetry install 28 | 29 | - name: Run lint 30 | run: | 31 | poetry run lint 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/poetry-tests.yml: -------------------------------------------------------------------------------- 1 | name: Poetry Tests 2 | 3 | on: 4 | push: 5 | branches: [ '**' ] 6 | pull_request: 7 | branches: [ '**' ] 8 | 9 | jobs: 10 | test: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.10' 21 | 22 | - name: Install Poetry 23 | uses: snok/install-poetry@v1 24 | 25 | - name: Install dependencies 26 | run: | 27 | poetry install 28 | 29 | - name: Run tests 30 | uses: coactions/setup-xvfb@v1 31 | with: 32 | run: poetry run pytest -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /conf/kasm/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/with-contenv bash 2 | 3 | # Pass gpu flags if mounted 4 | if ls /dev/dri/renderD* 1> /dev/null 2>&1 && [ -z ${DISABLE_DRI+x} ] && ! which nvidia-smi; then 5 | HW3D="-hw3d" 6 | fi 7 | if [ -z ${DRINODE+x} ]; then 8 | DRINODE="/dev/dri/renderD128" 9 | fi 10 | 11 | exec s6-setuidgid abc \ 12 | /usr/local/bin/Xvnc $DISPLAY \ 13 | ${HW3D} \ 14 | -PublicIP 127.0.0.1 \ 15 | -drinode ${DRINODE} \ 16 | -disableBasicAuth \ 17 | -SecurityTypes None \ 18 | -AlwaysShared \ 19 | -http-header Cross-Origin-Embedder-Policy=require-corp \ 20 | -http-header Cross-Origin-Opener-Policy=same-origin \ 21 | -geometry 1280x800 \ 22 | -sslOnly 0 \ 23 | -RectThreads 0 \ 24 | -websocketPort 6901 \ 25 | -interface 0.0.0.0 \ 26 | -Log *:stdout:10 -------------------------------------------------------------------------------- /docs/browser.rst: -------------------------------------------------------------------------------- 1 | Browser Operations 2 | ================== 3 | 4 | POST /open_url 5 | ^^^^^^^^^^^^^^ 6 | 7 | The ``/open_url`` endpoint opens a specified URL in the Chromium browser. 8 | 9 | **Request:** 10 | 11 | .. 
code-block:: json 12 | 13 | { 14 | "url": "https://example.com" 15 | } 16 | 17 | Attributes: 18 | 19 | - ``url`` (str): The URL to be opened in the browser. 20 | 21 | **Response:** 22 | 23 | Returns a JSON response indicating the status of the operation. 24 | 25 | .. code-block:: json 26 | 27 | { 28 | "status": "success" 29 | } 30 | 31 | Possible ``status`` values: 32 | 33 | - ``success``: The URL was successfully opened in the browser. 34 | - ``error``: An error occurred while attempting to open the URL. An additional ``message`` field will provide details about the error. 35 | -------------------------------------------------------------------------------- /docs/screenshots.rst: -------------------------------------------------------------------------------- 1 | Making Screenshots 2 | =================== 3 | 4 | POST /screenshot 5 | ^^^^^^^^^^^^^^^^ 6 | 7 | The ``/screenshot`` endpoint captures the current screen and returns an image. 8 | 9 | **Request:** 10 | 11 | No parameters required. 12 | 13 | **Response:** 14 | 15 | Returns a JSON response containing the screenshot image encoded in base64 and the file path where the screenshot is saved. 16 | 17 | .. code-block:: json 18 | 19 | { 20 | "status": "success", 21 | "image": "base64_encoded_image", 22 | "file_path": "path/to/screenshot.png" 23 | } 24 | 25 | Possible ``status`` values: 26 | 27 | - ``success``: The screenshot was successfully captured and returned. 28 | - ``error``: An error occurred while attempting to capture the screenshot. An additional ``message`` field will provide details about the error. 29 | -------------------------------------------------------------------------------- /remote_install_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define where to clone the repository 4 | INSTALL_DIR="/home/agentsea/agentd" 5 | rm -rf $INSTALL_DIR 6 | 7 | # Clone the repository 8 | echo "Cloning repository into $INSTALL_DIR..." 9 | git clone https://github.com/agentsea/agentd.git "$INSTALL_DIR" 10 | chown -R agentsea:agentsea $INSTALL_DIR 11 | 12 | # Check if git clone was successful 13 | if [ $? -ne 0 ]; then 14 | echo "Failed to clone the repository. Please check your internet connection and repository URL." 15 | exit 1 16 | fi 17 | 18 | # Change directory to the cloned repository 19 | cd "$INSTALL_DIR" 20 | 21 | apt install -y xdotool 22 | 23 | # whoami 24 | # bash install_deps.sh 25 | 26 | # Assuming your script uses other scripts or configurations from the repo 27 | # Execute a specific script from the cloned repository 28 | echo "Installation completed." 
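# NOTE: unlike remote_install.sh, this server variant only clones the repo and
# installs xdotool; the dependency setup (install_deps.sh) is left commented
# out above.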
29 | -------------------------------------------------------------------------------- /.github/workflows/poetry-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Sphinx Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build-and-deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.10' 18 | 19 | - name: Install Poetry 20 | uses: snok/install-poetry@v1 21 | 22 | - name: Install dependencies 23 | run: | 24 | poetry install 25 | 26 | - name: Build Sphinx Documentation 27 | run: | 28 | poetry run build-docs 29 | 30 | - name: Deploy to GitHub Pages 31 | uses: peaceiris/actions-gh-pages@v3 32 | with: 33 | github_token: ${{ secrets.GITHUB_TOKEN }} 34 | publish_dir: ./docs/_build/html 35 | publish_branch: gh-pages 36 | -------------------------------------------------------------------------------- /install_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(whoami)" != "agentsea" ]; then 4 | echo "This script must be run as the user 'agentsea'. Exiting..." 5 | exit 1 6 | fi 7 | 8 | # Define the path to be added 9 | PATH_TO_ADD="/home/agentsea/.local/bin" 10 | 11 | # Define the profile file 12 | PROFILE_FILE="/home/agentsea/.bashrc" 13 | 14 | # Check if the path is already in the PATH variable within the profile file 15 | if ! grep -qxF "export PATH=\"\$PATH:$PATH_TO_ADD\"" $PROFILE_FILE; then 16 | # If the path is not in the file, append the export command to the profile file 17 | echo "export PATH=\"\$PATH:$PATH_TO_ADD\"" >> $PROFILE_FILE 18 | echo "Path $PATH_TO_ADD added to PATH permanently for user agentsea." 19 | else 20 | echo "Path $PATH_TO_ADD is already in PATH for user agentsea." 21 | fi 22 | 23 | export PATH="$PATH:$PATH_TO_ADD" 24 | 25 | python3 -m pip install mss "fastapi[all]" pyautogui pynput "uvicorn[standard]" psutil 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /install_desktop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $EUID -ne 0 ]]; then 4 | echo "This script must be run as root (or with sudo). Exiting..." 5 | exit 1 6 | fi 7 | 8 | echo "creating user..." 
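# Create the 'agentsea' user non-interactively and grant it passwordless sudo;
# the systemd units under conf/ all run as this user.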
9 | adduser --disabled-password --gecos '' agentsea 10 | chown -R agentsea:agentsea /home/agentsea 11 | echo 'agentsea ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/agentsea 12 | 13 | echo "installing base packages..." 14 | apt-get update 15 | apt-get install -y xvfb ubuntu-desktop x11vnc websockify python3-pip python3-dev python3-venv 16 | snap install chromium 17 | 18 | echo "setting up firewall..." 19 | ufw_status=$(ufw status | grep -o "inactive") 20 | if [ "$ufw_status" == "inactive" ]; then 21 | echo "UFW is inactive. Enabling..." 22 | ufw enable 23 | fi 24 | 25 | # ssh 26 | ufw allow 22/tcp 27 | ufw reload 28 | 29 | 30 | cloud-init clean --logs 31 | truncate -s 0 /etc/machine-id 32 | rm /var/lib/dbus/machine-id 33 | ln -s /etc/machine-id /var/lib/dbus/machine-id -------------------------------------------------------------------------------- /remote_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define where to clone the repository 4 | INSTALL_DIR="/home/agentsea/agentd" 5 | if [ -d "$INSTALL_DIR" ]; then 6 | echo "$INSTALL_DIR already exists. Consider removing it first if you want a fresh install." 7 | exit 1 8 | fi 9 | 10 | # Clone the repository 11 | echo "Cloning repository into $INSTALL_DIR..." 12 | git clone https://github.com/agentsea/agentd.git "$INSTALL_DIR" 13 | 14 | # Check if git clone was successful 15 | if [ $? -ne 0 ]; then 16 | echo "Failed to clone the repository. Please check your internet connection and repository URL." 17 | exit 1 18 | fi 19 | 20 | # Change directory to the cloned repository 21 | cd "$INSTALL_DIR" 22 | 23 | # Assuming your script uses other scripts or configurations from the repo 24 | # Execute a specific script from the cloned repository 25 | echo "Running installation script from the cloned repository..." 26 | bash install.sh 27 | 28 | echo "Installation completed." 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kentauros AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import os 7 | import sys 8 | 9 | # -- Project information ----------------------------------------------------- 10 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 11 | 12 | project = 'agentd' 13 | copyright = '2024, Kentauros AI' 14 | author = 'Kentauros AI' 15 | release = '0.1.0' 16 | 17 | extensions = [ 18 | "sphinx.ext.autodoc", 19 | "sphinx.ext.viewcode", 20 | "sphinx.ext.napoleon", 21 | "recommonmark", 22 | ] 23 | 24 | source_suffix = [".rst", ".md"] 25 | 26 | templates_path = ["_templates"] 27 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = "sphinx_rtd_theme" 33 | 34 | # -- Source files location ---------------------------------------------------- 35 | 36 | sys.path.insert(0, os.path.abspath("../agentd")) 37 | -------------------------------------------------------------------------------- /agentd/chromium.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import signal 4 | 5 | 6 | def is_chromium_running() -> list: 7 | """ 8 | Checks if Chromium is running and returns a list of PIDs. 9 | """ 10 | try: 11 | output = subprocess.check_output(["pgrep", "-f", "chromium"]) 12 | return [int(pid) for pid in output.decode().strip().split("\n")] 13 | except subprocess.CalledProcessError: 14 | return [] 15 | 16 | 17 | def is_chromium_window_open(): 18 | try: 19 | output = subprocess.check_output(["wmctrl", "-l", "-x"]) 20 | return "Chromium" in output.decode() 21 | except subprocess.CalledProcessError: 22 | return False 23 | 24 | 25 | def gracefully_terminate_chromium(pids: list): 26 | """ 27 | Attempts to gracefully terminate Chromium processes given their PIDs. 
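    Example (a minimal sketch; assumes every PID returned by
    is_chromium_running should be terminated):

        pids = is_chromium_running()
        if pids:
            gracefully_terminate_chromium(pids)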
28 | """ 29 | for pid in pids: 30 | try: 31 | os.kill(pid, signal.SIGTERM) 32 | print(f"Sent SIGTERM to Chromium process {pid}.") 33 | except ProcessLookupError: 34 | print(f"Chromium process {pid} not found.") 35 | except Exception as e: 36 | print(f"Error terminating Chromium process {pid}: {e}") 37 | -------------------------------------------------------------------------------- /agentd/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | 4 | LOGGING_CONFIG = { 5 | "version": 1, 6 | "disable_existing_loggers": False, 7 | "formatters": { 8 | "api_formatter": { 9 | "format": "[api] %(asctime)s %(levelname)s [%(funcName)s]: %(message)s", 10 | "datefmt": "%Y-%m-%d %H:%M:%S", 11 | }, 12 | "recording_formatter": { 13 | "format": "[recording] %(asctime)s %(levelname)s [%(funcName)s]: %(message)s", 14 | "datefmt": "%Y-%m-%d %H:%M:%S", 15 | }, 16 | }, 17 | "handlers": { 18 | "api_console": { 19 | "class": "logging.StreamHandler", 20 | "formatter": "api_formatter", 21 | "stream": "ext://sys.stdout", 22 | }, 23 | "recording_console": { 24 | "class": "logging.StreamHandler", 25 | "formatter": "recording_formatter", 26 | "stream": "ext://sys.stdout", 27 | }, 28 | }, 29 | "loggers": { 30 | "api": { 31 | "handlers": ["api_console"], 32 | "level": "INFO", 33 | "propagate": False, 34 | }, 35 | "recording": { 36 | "handlers": ["recording_console"], 37 | "level": "INFO", 38 | "propagate": False, 39 | }, 40 | }, 41 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "agentd" 3 | version = "0.1.0" 4 | description = "A daemon that makes a desktop OS accessible to AI agents" 5 | authors = ["Patrick Barker "] 6 | license = "Apache 2.0" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.12" 11 | pyautogui = "^0.9.54" 12 | mss = "^9.0.1" 13 | pynput = "^1.7.6" 14 | psutil = "^5.9.8" 15 | python-xlib = "^0.33" 16 | pillow = "^10.4.0" 17 | pyscreeze = "^1.0.1" 18 | fastapi = {version = "0.109", extras = ["all"]} 19 | tiktoken = "0.7.0" 20 | celery = "^5.4.0" 21 | celery-types = "^0.22.0" 22 | redis = "^5.2.1" 23 | taskara = "^0.1.225" 24 | 25 | [tool.poetry.group.dev.dependencies] 26 | pytest = "^8.1.0" 27 | pytest-asyncio = "^0.23.5" 28 | flake8 = "^7.0.0" 29 | black = "^24.2.0" 30 | sphinx = "^7.2.6" 31 | sphinx-rtd-theme = "^2.0.0" 32 | recommonmark = "^0.7.1" 33 | 34 | [tool.pyright] 35 | reportUnknownParameterType = false 36 | reportMissingTypeArgument = false 37 | reportUnknownMemberType = false 38 | reportUnknownVariableType = false 39 | reportUnknownArgumentType = false 40 | reportPrivateUsage = false 41 | reportMissingParameterType = false 42 | 43 | [build-system] 44 | requires = ["poetry-core"] 45 | build-backend = "poetry.core.masonry.api" 46 | 47 | [tool.poetry.scripts] 48 | build-docs = "scripts.build_docs:main" 49 | lint = "scripts.lint:main" 50 | -------------------------------------------------------------------------------- /theme/xfce4-desktop.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /xfce4-desktop.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/keyboard.rst: -------------------------------------------------------------------------------- 1 | Keyboard Operations 2 | ==================== 3 | 4 | POST /type_text 5 | ^^^^^^^^^^^^^^^ 6 | 7 | The ``/type_text`` endpoint simulates typing text at the current cursor location. 8 | 9 | **Request:** 10 | 11 | .. code-block:: json 12 | 13 | { 14 | "text": "Hello, world!", 15 | "min_interval": 0.05, 16 | "max_interval": 0.25 17 | } 18 | 19 | Attributes: 20 | 21 | - ``text`` (str): The text to be typed. 22 | - ``min_interval`` (float, optional): The minimum interval between key presses. Defaults to 0.05 seconds. 23 | - ``max_interval`` (float, optional): The maximum interval between key presses. Defaults to 0.25 seconds. 24 | 25 | **Response:** 26 | 27 | Returns a JSON response indicating the status of the operation. 28 | 29 | .. code-block:: json 30 | 31 | { 32 | "status": "success" 33 | } 34 | 35 | Possible ``status`` values: 36 | 37 | - ``success``: The text was successfully typed at the current cursor location. 38 | - ``error``: An error occurred while attempting to type the text. An additional ``message`` field will provide details about the error. 39 | 40 | POST /press_key 41 | ^^^^^^^^^^^^^^^ 42 | 43 | The ``/press_key`` endpoint simulates pressing a key on the keyboard. 44 | 45 | **Request:** 46 | 47 | .. code-block:: json 48 | 49 | { 50 | "key": "string" 51 | } 52 | 53 | **Response:** 54 | 55 | Returns a JSON response indicating the status of the operation. 56 | 57 | .. code-block:: json 58 | 59 | { 60 | "status": "success" 61 | } 62 | 63 | Possible ``status`` values: 64 | 65 | - ``success``: The key was successfully pressed. 66 | - ``error``: An error occurred while attempting to press the key. An additional ``message`` field will provide details about the error. 67 | -------------------------------------------------------------------------------- /theme/xsettings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /pack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default builder flags 4 | BUILD_QEMU=${BUILD_QEMU:-true} 5 | BUILD_EC2=${BUILD_EC2:-true} 6 | BUILD_GCE=${BUILD_GCE:-true} 7 | 8 | # Parse command-line arguments 9 | while [[ "$#" -gt 0 ]]; do 10 | case $1 in 11 | --no-qemu) BUILD_QEMU=false ;; 12 | --no-ec2) BUILD_EC2=false ;; 13 | --no-gce) BUILD_GCE=false ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | # Fetch the current GCP project ID 20 | export GCP_PROJECT_ID=$(gcloud config get-value project) 21 | 22 | # Fetch the current AWS region 23 | export AWS_REGION=$(aws configure get region) 24 | 25 | # Check if GCP_PROJECT_ID is not empty 26 | if [ -z "$GCP_PROJECT_ID" ]; then 27 | echo "GCP Project ID could not be found. Ensure you're logged in to gcloud and have a project set." 
28 | exit 1 29 | fi 30 | 31 | # Check if AWS_REGION is not empty 32 | if [ -z "$AWS_REGION" ]; then 33 | echo "AWS Region could not be found. Ensure you're logged in to aws cli and have a default region set." 34 | exit 1 35 | fi 36 | 37 | rm -rf ~/.cache/packer 38 | 39 | # Initialize Packer configuration 40 | packer init base.pkr.hcl 41 | 42 | # Generate a timestamp 43 | TIMESTAMP=$(date +%Y%m%d%H%M%S) 44 | 45 | # Define the base directory for VM outputs 46 | BASE_DIR=".vms/jammy" 47 | 48 | # Create a unique output directory with the timestamp 49 | OUTPUT_DIRECTORY="${BASE_DIR}/${TIMESTAMP}" 50 | 51 | # Ensure the directory exists 52 | mkdir -p "${BASE_DIR}" 53 | 54 | # Run Packer with the current GCP project ID, AWS region, generated timestamp for version, and builder flags 55 | PACKER_LOG=1 packer build \ 56 | -var 'gcp_project_id='"$GCP_PROJECT_ID" \ 57 | -var 'aws_region='"$AWS_REGION" \ 58 | -var 'version='"$TIMESTAMP" \ 59 | -var "output_directory=${OUTPUT_DIRECTORY}" \ 60 | -var 'build_qemu='"$BUILD_QEMU" \ 61 | -var 'build_ec2='"$BUILD_EC2" \ 62 | -var 'build_gce='"$BUILD_GCE" \ 63 | base.pkr.hcl 64 | 65 | # gsutil cp .vms/jammy/latest/jammy.qcow2 gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 66 | # gsutil acl ch -u AllUsers:R gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 -------------------------------------------------------------------------------- /pack_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default builder flags 4 | BUILD_QEMU=${BUILD_QEMU:-true} 5 | BUILD_EC2=${BUILD_EC2:-true} 6 | BUILD_GCE=${BUILD_GCE:-true} 7 | 8 | # Parse command-line arguments 9 | while [[ "$#" -gt 0 ]]; do 10 | case $1 in 11 | --no-qemu) BUILD_QEMU=false ;; 12 | --no-ec2) BUILD_EC2=false ;; 13 | --no-gce) BUILD_GCE=false ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | # Fetch the current GCP project ID 20 | export GCP_PROJECT_ID=$(gcloud config get-value project) 21 | 22 | # Fetch the current AWS region 23 | export AWS_REGION=$(aws configure get region) 24 | 25 | # Check if GCP_PROJECT_ID is not empty 26 | if [ -z "$GCP_PROJECT_ID" ]; then 27 | echo "GCP Project ID could not be found. Ensure you're logged in to gcloud and have a project set." 28 | exit 1 29 | fi 30 | 31 | # Check if AWS_REGION is not empty 32 | if [ -z "$AWS_REGION" ]; then 33 | echo "AWS Region could not be found. Ensure you're logged in to aws cli and have a default region set." 
34 | exit 1 35 | fi 36 | 37 | rm -rf ~/.cache/packer 38 | 39 | # Initialize Packer configuration 40 | packer init server.pkr.hcl 41 | 42 | # Generate a timestamp 43 | TIMESTAMP=$(date +%Y%m%d%H%M%S) 44 | 45 | # Define the base directory for VM outputs 46 | BASE_DIR=".vms/jammy" 47 | 48 | # Create a unique output directory with the timestamp 49 | OUTPUT_DIRECTORY="${BASE_DIR}/${TIMESTAMP}" 50 | 51 | # Ensure the directory exists 52 | mkdir -p "${BASE_DIR}" 53 | 54 | # Run Packer with the current GCP project ID, AWS region, generated timestamp for version, and builder flags 55 | PACKER_LOG=1 packer build \ 56 | -var 'gcp_project_id='"$GCP_PROJECT_ID" \ 57 | -var 'aws_region='"$AWS_REGION" \ 58 | -var 'version='"$TIMESTAMP" \ 59 | -var "output_directory=${OUTPUT_DIRECTORY}" \ 60 | -var 'build_qemu='"$BUILD_QEMU" \ 61 | -var 'build_ec2='"$BUILD_EC2" \ 62 | -var 'build_gce='"$BUILD_GCE" \ 63 | server.pkr.hcl 64 | 65 | # gsutil cp .vms/jammy/latest/jammy.qcow2 gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 66 | # gsutil acl ch -u AllUsers:R gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - recordings 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-arm64 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | # Step: Set up Google Cloud authentication 17 | - name: Set up Google Cloud authentication 18 | uses: google-github-actions/auth@v2 19 | with: 20 | project_id: ${{ vars.PROJECT_ID }} 21 | credentials_json: ${{ secrets._GITHUB_ACTIONS_PUSH_IMAGES_KEY }} 22 | 23 | # Step 1: Set up Docker Buildx 24 | - name: Set up Docker Buildx 25 | run: | 26 | docker buildx create --name mybuilder --use 27 | docker buildx inspect --bootstrap 28 | 29 | # Step 2: Build and push for multiple architectures with caching 30 | - name: Build and Push 31 | run: | 32 | # Shorten the GitHub commit SHA (first 7 characters) 33 | SHORT_SHA=$(echo ${{ github.sha }} | cut -c1-7) 34 | echo "SHORT_SHA=${SHORT_SHA}" >> $GITHUB_ENV # Save SHORT_SHA to the environment for use in other steps 35 | 36 | docker buildx build \ 37 | --platform linux/arm64 \ 38 | -t us-docker.pkg.dev/${{ vars.PROJECT_ID }}/agentd/desktop-webtop:${{ env.SHORT_SHA }} \ 39 | --build-arg PYTHON_VERSION=3.12.0 \ 40 | --cache-from type=registry,ref=us-docker.pkg.dev/${{ vars.PROJECT_ID }}/agentd/desktop-webtop:buildcache \ 41 | --cache-to type=registry,ref=us-docker.pkg.dev/${{ vars.PROJECT_ID }}/agentd/desktop-webtop:buildcache,mode=max \ 42 | --push \ 43 | . 44 | 45 | # Step 4: Verify the multi-arch image 46 | - name: Verify Images 47 | run: | 48 | docker buildx imagetools inspect us-docker.pkg.dev/${{ vars.PROJECT_ID }}/agentd/desktop-webtop:${{ env.SHORT_SHA }} 49 | 50 | # Optional: Set timeout and machine type (not directly supported in GitHub Actions, but can be controlled via runners) 51 | # timeout: "3600s" # GitHub actions does not directly support timeouts in YAML, can be controlled at job level. 52 | # options: machineType: "N1_HIGHCPU_32" # You would need to use a custom runner for machine type configuration. 
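# Note: "ubuntu-arm64" in runs-on above is not a standard GitHub-hosted runner
# label; the workflow assumes a self-hosted or custom runner registered under
# that name.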
53 | -------------------------------------------------------------------------------- /cloudbuild_old.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | # Build for x86_64 3 | - name: "gcr.io/cloud-builders/docker" 4 | args: [ 5 | "build", 6 | "--no-cache", 7 | "--pull", 8 | "--platform", 9 | "linux/amd64", # Specify platform explicitly 10 | "-f", 11 | "Dockerfile.amd64", # Your custom Dockerfile for x86_64 12 | "-t", 13 | "gcr.io/$PROJECT_ID/agentd-webtop:latest-amd64", 14 | ".", 15 | ] 16 | 17 | # Set up QEMU for ARM builds 18 | - name: "gcr.io/cloud-builders/docker" 19 | args: 20 | [ 21 | "run", 22 | "--rm", 23 | "--privileged", 24 | "multiarch/qemu-user-static:register", 25 | "--reset", 26 | ] 27 | 28 | # Build for ARM64 29 | - name: "gcr.io/cloud-builders/docker" 30 | args: [ 31 | "build", 32 | "--no-cache", 33 | "--pull", 34 | "--platform", 35 | "linux/arm64", # Specify platform explicitly 36 | "-f", 37 | "Dockerfile.arm64", # Your custom Dockerfile for ARM64 38 | "-t", 39 | "gcr.io/$PROJECT_ID/agentd-webtop:latest-arm64", 40 | ".", 41 | ] 42 | 43 | # Create and push a multi-arch manifest 44 | - name: "gcr.io/cloud-builders/docker" 45 | entrypoint: "bash" 46 | args: 47 | - "-c" 48 | - | 49 | echo '{ "experimental": true }' | sudo tee /etc/docker/daemon.json 50 | sudo service docker restart 51 | docker manifest create gcr.io/$PROJECT_ID/agentd-webtop:latest \ 52 | gcr.io/$PROJECT_ID/agentd-webtop:latest-amd64 \ 53 | gcr.io/$PROJECT_ID/agentd-webtop:latest-arm64 54 | docker manifest annotate gcr.io/$PROJECT_ID/agentd-webtop:latest \ 55 | gcr.io/$PROJECT_ID/agentd-webtop:latest-amd64 --os linux --arch amd64 56 | docker manifest annotate gcr.io/$PROJECT_ID/agentd-webtop:latest \ 57 | gcr.io/$PROJECT_ID/agentd-webtop:latest-arm64 --os linux --arch arm64 58 | docker manifest push gcr.io/$PROJECT_ID/agentd-webtop:latest 59 | 60 | # Images to be pushed to Google Container Registry 61 | images: 62 | - "gcr.io/$PROJECT_ID/agentd-webtop:latest-amd64" 63 | - "gcr.io/$PROJECT_ID/agentd-webtop:latest-arm64" 64 | - "gcr.io/$PROJECT_ID/agentd-webtop:latest" 65 | 66 | # Set a longer timeout for the build process (default is 10m) 67 | timeout: "3600s" 68 | 69 | # Use a larger machine type for faster builds 70 | options: 71 | machineType: "N1_HIGHCPU_8" 72 | -------------------------------------------------------------------------------- /docs/info.rst: -------------------------------------------------------------------------------- 1 | System Information and Health 2 | ============================= 3 | 4 | GET "/" 5 | ^^^^^^^ 6 | 7 | The root endpoint returns a welcome message. This endpoint serves as a basic check to ensure 8 | the agent service is running and accessible. 9 | 10 | **Request:** 11 | 12 | No parameters required. 13 | 14 | **Response:** 15 | 16 | Returns a JSON response with a welcome message. 17 | 18 | .. code-block:: json 19 | 20 | { 21 | "message": "Agent in the shell" 22 | } 23 | 24 | GET /health 25 | ^^^^^^^^^^^ 26 | 27 | The ``/health`` endpoint returns a health check for the agent service. 28 | 29 | **Request:** 30 | 31 | No parameters required. 32 | 33 | **Response:** 34 | 35 | Returns a JSON response with a health check. 36 | 37 | .. code-block:: json 38 | 39 | { 40 | "status": "ok" 41 | } 42 | 43 | GET /info 44 | ^^^^^^^^^ 45 | 46 | The ``/info`` endpoint returns detailed information about the system where the agent is running. 47 | 48 | **Request:** 49 | 50 | No parameters required. 
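An example request (a sketch assuming the daemon is reachable at ``http://localhost:8000``; the host and port are illustrative):

.. code-block:: python

   import requests

   # Query the agentd info endpoint; the base URL is an assumption
   resp = requests.get("http://localhost:8000/info")
   resp.raise_for_status()
   print(resp.json())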
51 | 52 | **Response:** 53 | 54 | Returns a JSON response with the system information. 55 | 56 | .. code-block:: json 57 | 58 | { 59 | "last_activity_ts": 1625079600, 60 | "screen_size": { 61 | "x": 1920, 62 | "y": 1080 63 | }, 64 | "os_info": "Linux 5.8.0-53-generic", 65 | "code_version": "a1b2c3d4" 66 | } 67 | 68 | The response includes the last activity timestamp (``last_activity_ts``), screen size (``screen_size``), operating system information (``os_info``), and the current code version (``code_version``). 69 | 70 | GET /screen_size 71 | ^^^^^^^^^^^^^^^^ 72 | 73 | The ``/screen_size`` endpoint returns the current screen size of the system where the agent is running. 74 | 75 | **Request:** 76 | 77 | No parameters required. 78 | 79 | **Response:** 80 | 81 | Returns a JSON response with the screen size. 82 | 83 | .. code-block:: json 84 | 85 | { 86 | "x": 1920, 87 | "y": 1080 88 | } 89 | 90 | The response includes the width (``x``) and height (``y``) of the screen in pixels. 91 | 92 | GET /system_usage 93 | ^^^^^^^^^^^^^^^^^ 94 | 95 | This endpoint retrieves the current system usage statistics. 96 | 97 | **Response:** 98 | 99 | Returns a JSON response containing the current system usage statistics including CPU, memory, and disk usage percentages. 100 | 101 | .. code-block:: json 102 | 103 | { 104 | "cpu_percent": 23.5, 105 | "memory_percent": 74.2, 106 | "disk_percent": 55.3 107 | } 108 | 109 | This endpoint allows you to monitor the health and performance of the system where the agent is running. 110 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First off, thank you for considering contributing to this project. It's people like you that make it such a great tool. 4 | 5 | ## Code of Conduct 6 | 7 | This project adheres to a Code of Conduct that we expect project participants to adhere to. Please read [the full text](CODE_OF_CONDUCT.md) so that you can understand what actions will and will not be tolerated. 8 | 9 | ## What we are looking for 10 | 11 | This is an open-source project, and we welcome contributions of all kinds: new features, bug fixes, documentation, examples, or enhancements to existing features. We are always thrilled to receive contributions from the community. 12 | 13 | ## How to contribute 14 | 15 | If you've never contributed to an open-source project before, here are a few steps to get you started: 16 | 17 | ### Reporting Issues 18 | 19 | Before submitting a bug report or feature request, check to make sure it hasn't already been submitted. You can search through existing issues and pull requests to see if someone has reported one similar to yours. 20 | 21 | When you are creating a bug report, please include as much detail as possible. 22 | 23 | ### Pull Requests 24 | 25 | - Fork the repository and create your branch from `main`. 26 | - If you've added code that should be tested, add tests. 27 | - If you've changed APIs, update the documentation. 28 | - Ensure the test suite passes. 29 | - Make sure your code lints. 30 | - Issue that pull request! 31 | 32 | ### Getting started 33 | 34 | For something that is bigger than a one or two-line fix: 35 | 36 | 1. Create your own fork of the code. 37 | 2. Do the changes in your fork. 38 | 3. If you like the change and think the project could use it: 39 | - Be sure you have followed the code style for the project. 40 | - Note the Code of Conduct. 
41 | - Send a pull request. 42 | 43 | ## How to report a bug 44 | 45 | If you find a security vulnerability, do NOT open an issue. Email github@kentauros.ai instead. 46 | 47 | In order to help us understand and resolve your issue quickly, please include as much information as possible, including: 48 | 49 | - A quick summary and/or background 50 | - Steps to reproduce 51 | - Be specific! 52 | - Give a sample code if you can. 53 | - What you expected would happen 54 | - What actually happens 55 | - Notes (possibly including why you think this might be happening or stuff you tried that didn't work) 56 | 57 | People *love* thorough bug reports. I'm not even kidding. 58 | 59 | ## How to suggest a feature or enhancement 60 | 61 | If you find yourself wishing for a feature that doesn't exist in the project, you are probably not alone. There are bound to be others out there with similar needs. Open an issue on our issues list on GitHub, which describes the feature you would like to see, why you need it, and how it should work. 62 | 63 | ## Code review process 64 | 65 | The core team looks at Pull Requests on a regular basis in a bi-weekly triage meeting. After feedback has been given, we expect responses within two weeks. After two weeks, we may close the pull request if it isn't showing any activity. 66 | 67 | ## Community 68 | 69 | Discussions about the project take place in this repository's Issues and Pull Requests sections. Anybody is welcome to join these conversations. 70 | 71 | Wherever possible, we use GitHub to discuss changes and keep the decision-making process open. 72 | 73 | ## Thank you! 74 | 75 | Thank you for contributing! 76 | 77 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VMS_DIR := .vms 2 | JAMMY := $(VMS_DIR)/jammy.img 3 | META_DIR := ./meta 4 | TEMPLATE_FILE := user-data.tpl 5 | OUTPUT_FILE := $(META_DIR)/user-data 6 | SSH_KEY_FILE := $(shell [ -f ~/.ssh/id_rsa.pub ] && echo ~/.ssh/id_rsa.pub || echo ~/.ssh/id_ed25519.pub) 7 | JAMMY_LATEST := ./.vms/jammy/latest/jammy.qcow2 8 | 9 | $(JAMMY): 10 | @mkdir -p $(VMS_DIR) 11 | @test -f $(JAMMY) || (echo "Downloading jammy..." && curl -o $(JAMMY) https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img && echo "Download complete.") 12 | qemu-img resize $(JAMMY) +10G 13 | 14 | .PHONY: download-jammy 15 | download-jammy: $(JAMMY) 16 | 17 | .PHONY: prepare-user-data 18 | prepare-user-data: 19 | @mkdir -p $(META_DIR) 20 | @SSH_KEY=$$(cat $(SSH_KEY_FILE)); \ 21 | sed "s|{{ ssh_public_key }}|$$SSH_KEY|" $(TEMPLATE_FILE) > $(OUTPUT_FILE) 22 | @echo "User-data file prepared at $(OUTPUT_FILE)." 
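# Usage sketch (assumes standard GNU make command-line overrides): a different
# key can be supplied per invocation, e.g.
#   make prepare-user-data SSH_KEY_FILE=~/.ssh/some_other_key.pub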
23 | 24 | .PHONY: run-meta 25 | run-meta: 26 | python3 -m http.server 8060 --directory ./meta 27 | 28 | .PHONY: run-jammy 29 | run-jammy: prepare-user-data 30 | xorriso -as mkisofs -o cidata.iso -V "cidata" -J -r -iso-level 3 meta/ 31 | qemu-system-x86_64 -nographic -hda $(JAMMY_LATEST) \ 32 | -m 4G -smp 2 -netdev user,id=vmnet,hostfwd=tcp::6080-:6080,hostfwd=tcp::8000-:8000,hostfwd=tcp::2222-:22 \ 33 | -device e1000,netdev=vmnet -cdrom cidata.iso 34 | # -smbios type=1,serial=ds='nocloud;s=http://10.0.2.2:8060/'; 35 | 36 | .PHONY: clean 37 | clean: 38 | rm -rf $(VMS_DIR) 39 | 40 | .PHONY: pack 41 | pack: user-data 42 | ./pack.sh 43 | 44 | .PHONY: user-data 45 | user-data: 46 | # hdiutil makehybrid -o cidata.iso -hfs -joliet -iso -default-volume-name cidata root_meta/ 47 | xorriso -as mkisofs -o cidata_root.iso -V "cidata" -J -r -iso-level 3 root_meta/ 48 | 49 | .PHONY: push-latest 50 | push-latest: 51 | gsutil cp .vms/jammy/latest/jammy.qcow2 gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 52 | gsutil acl ch -u AllUsers:R gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2 53 | 54 | .PHONY: exp-deps 55 | exp-deps: 56 | poetry export -f requirements.txt --output requirements.txt --without-hashes 57 | 58 | .PHONY: run-latest-auth 59 | run-latest-auth: 60 | docker run -d \ 61 | --platform linux/arm64 \ 62 | --name=webtop \ 63 | --security-opt seccomp=unconfined \ 64 | -e PUID=1000 \ 65 | -e PGID=1000 \ 66 | -e CUSTOM_USER=agentd \ 67 | -e PASSWORD=agentd \ 68 | -e TZ=Etc/UTC \ 69 | -p 3000:3000 \ 70 | -p 3001:3001 \ 71 | -p 8000:8000 \ 72 | --restart unless-stopped \ 73 | us-docker.pkg.dev/agentsea-dev/agentd/desktop-webtop:efc7aed 74 | 75 | .PHONY: run-latest 76 | run-latest: 77 | docker run -d \ 78 | --platform linux/arm64 \ 79 | --name=webtop \ 80 | --security-opt seccomp=unconfined \ 81 | -e TZ=Etc/UTC \ 82 | -p 3000:3000 \ 83 | -p 3001:3001 \ 84 | -p 8000:8000 \ 85 | --restart unless-stopped \ 86 | us-docker.pkg.dev/agentsea-dev/agentd/desktop-webtop:773b6aa 87 | # us-docker.pkg.dev/agentsea-dev/agentd/desktop-webtop:latest 88 | 89 | 90 | .PHONY: dev 91 | dev: 92 | docker run -d \ 93 | --platform linux/arm64 \ 94 | --name=webtop \ 95 | --security-opt seccomp=unconfined \ 96 | -e TZ=Etc/UTC \ 97 | -p 3000:3000 \ 98 | -p 3001:3001 \ 99 | -p 8000:8000 \ 100 | --restart unless-stopped \ 101 | -v $(shell pwd)/agentd:/config/app/agentd \ 102 | us-docker.pkg.dev/agentsea-dev/agentd/desktop-webtop:latest -------------------------------------------------------------------------------- /agentd/firefox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | 5 | 6 | def is_firefox_running() -> list: 7 | """ 8 | Checks if Firefox is running and returns a list of PIDs. 9 | """ 10 | try: 11 | output = subprocess.check_output(["pgrep", "-f", "firefox"]) 12 | return [int(pid) for pid in output.decode().strip().split("\n")] 13 | except subprocess.CalledProcessError: 14 | return [] 15 | 16 | 17 | def is_firefox_window_open(): 18 | try: 19 | output = subprocess.check_output( 20 | ["xdotool", "search", "--onlyvisible", "--class", "firefox"] 21 | ) 22 | return bool(output.strip()) 23 | except subprocess.CalledProcessError: 24 | return False 25 | 26 | 27 | def gracefully_terminate_firefox(pids: list): 28 | """ 29 | Attempts to gracefully terminate Firefox processes given their PIDs. 
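    Example (a minimal sketch; assumes every PID returned by
    is_firefox_running should be terminated):

        pids = is_firefox_running()
        if pids:
            gracefully_terminate_firefox(pids)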
30 | """ 31 | for pid in pids: 32 | try: 33 | os.kill(pid, signal.SIGTERM) 34 | print(f"Sent SIGTERM to Firefox process {pid}.") 35 | except ProcessLookupError: 36 | print(f"Firefox process {pid} not found.") 37 | except Exception as e: 38 | print(f"Error terminating Firefox process {pid}: {e}") 39 | 40 | 41 | def maximize_firefox_window(): 42 | """ 43 | Maximizes the Firefox window by resizing it to the full screen size. 44 | """ 45 | try: 46 | # Get the window ID(s) of the Firefox window(s) 47 | window_ids_output = subprocess.check_output( 48 | ["xdotool", "search", "--onlyvisible", "--class", "firefox"] 49 | ) 50 | window_ids = window_ids_output.decode("utf-8").split() 51 | 52 | # Get the display geometry (screen width and height) 53 | geometry_output = subprocess.check_output(["xdotool", "getdisplaygeometry"]) 54 | screen_width, screen_height = geometry_output.decode("utf-8").split() 55 | 56 | for window_id in window_ids: 57 | # Activate the window 58 | subprocess.run( 59 | ["xdotool", "windowactivate", "--sync", window_id], check=True 60 | ) 61 | 62 | # Resize the window to match the screen dimensions 63 | subprocess.run( 64 | ["xdotool", "windowsize", window_id, screen_width, screen_height], 65 | check=True, 66 | ) 67 | 68 | # Move the window to the top-left corner 69 | subprocess.run(["xdotool", "windowmove", window_id, "0", "0"], check=True) 70 | 71 | print(f"Maximized Firefox window with window ID {window_id}") 72 | except subprocess.CalledProcessError as e: 73 | print(f"Failed to maximize Firefox window: {e}") 74 | 75 | 76 | def close_firefox_window(): 77 | """ 78 | Closes the Firefox window gracefully using xdotool's windowclose command. 79 | """ 80 | try: 81 | # Get the window ID(s) of the Firefox window(s) 82 | window_ids_output = subprocess.check_output( 83 | ["xdotool", "search", "--onlyvisible", "--class", "firefox"] 84 | ) 85 | window_ids = window_ids_output.decode("utf-8").split() 86 | 87 | for window_id in window_ids: 88 | # Close the window 89 | subprocess.run(["xdotool", "windowclose", window_id], check=True) 90 | 91 | print(f"Closed Firefox window with window ID {window_id}") 92 | except subprocess.CalledProcessError as e: 93 | print(f"Failed to close Firefox window: {e}") 94 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or email address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies within all project spaces, including GitHub, and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at github@kentauros.ai. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality regarding the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 44 | 45 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 46 | 47 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
48 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | # Set up QEMU for multi-architecture support 3 | # - name: "gcr.io/cloud-builders/docker" 4 | # entrypoint: "bash" 5 | # args: 6 | # - "-c" 7 | # - | 8 | # docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 9 | 10 | # Set up Docker Buildx 11 | - name: "gcr.io/cloud-builders/docker" 12 | entrypoint: "bash" 13 | args: 14 | - "-c" 15 | - | 16 | docker buildx create --name mybuilder --use 17 | docker buildx inspect --bootstrap 18 | 19 | # Build and push the multi-arch (arm64 + amd64) image with registry caching 20 | - name: "gcr.io/cloud-builders/docker" 21 | id: Build and Push 22 | entrypoint: "bash" 23 | args: 24 | - "-c" 25 | - | 26 | docker buildx build \ 27 | --platform linux/arm64,linux/amd64 \ 28 | -t us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:${SHORT_SHA} \ 29 | --build-arg PYTHON_VERSION=3.12.0 \ 30 | --cache-from type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:buildcache \ 31 | --cache-to type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:buildcache,mode=max \ 32 | --push \ 33 | . 34 | 35 | # Build and push the multi-arch "loaded" image with registry caching 36 | - name: "gcr.io/cloud-builders/docker" 37 | id: Build and Push Loaded 38 | entrypoint: "bash" 39 | args: 40 | - "-c" 41 | - | 42 | docker buildx build \ 43 | --platform linux/arm64,linux/amd64 \ 44 | -f Dockerfile.loaded \ 45 | -t us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop-loaded:${SHORT_SHA} \ 46 | --build-arg PYTHON_VERSION=3.12.0 \ 47 | --cache-from type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop-loaded:buildcache \ 48 | --cache-to type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop-loaded:buildcache,mode=max \ 49 | --push \ 50 | . 51 | 52 | # - name: "gcr.io/cloud-builders/docker" 53 | # id: Build and Push AMD 54 | # entrypoint: "bash" 55 | # args: 56 | # - "-c" 57 | # - | 58 | # docker buildx build \ 59 | # --platform linux/amd64 \ 60 | # -t us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:${SHORT_SHA} \ 61 | # --build-arg PYTHON_VERSION=3.12.0 \ 62 | # --cache-from type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:buildcache \ 63 | # --cache-to type=registry,ref=us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:buildcache,mode=max \ 64 | # --push \ 65 | # .
66 | 67 | # Verify the multi-arch image 68 | - name: "gcr.io/cloud-builders/docker" 69 | id: Verify Images 70 | entrypoint: "bash" 71 | args: 72 | - "-c" 73 | - | 74 | docker buildx imagetools inspect us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:${SHORT_SHA} 75 | if [ "$BRANCH_NAME" == "main" ]; then 76 | docker buildx imagetools inspect us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop:latest 77 | fi 78 | 79 | # Verify the multi-arch image 80 | - name: "gcr.io/cloud-builders/docker" 81 | id: Verify Images Loaded 82 | entrypoint: "bash" 83 | args: 84 | - "-c" 85 | - | 86 | docker buildx imagetools inspect us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop-loaded:${SHORT_SHA} 87 | if [ "$BRANCH_NAME" == "main" ]; then 88 | docker buildx imagetools inspect us-docker.pkg.dev/$PROJECT_ID/agentd/desktop-webtop-loaded:latest 89 | fi 90 | 91 | timeout: "3600s" 92 | 93 | options: 94 | machineType: "N1_HIGHCPU_32" 95 | -------------------------------------------------------------------------------- /agentd/models.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | from skillpacks import V1EnvState, V1Action 3 | from pydantic import BaseModel 4 | 5 | 6 | class OpenURLModel(BaseModel): 7 | url: str 8 | 9 | 10 | class ScreenSizeModel(BaseModel): 11 | x: int 12 | y: int 13 | 14 | 15 | class SystemInfoModel(BaseModel): 16 | last_activity_ts: int | None 17 | screen_size: ScreenSizeModel 18 | os_info: str 19 | code_version: str | None 20 | 21 | 22 | class MoveMouseModel(BaseModel): 23 | x: int 24 | y: int 25 | duration: float = 1.0 26 | tween: str = "easeInOutQuad" 27 | 28 | 29 | class ClickModel(BaseModel): 30 | button: str = "left" 31 | location: Optional[MoveMouseModel] = None 32 | 33 | 34 | class TypeTextModel(BaseModel): 35 | text: str 36 | min_interval: float = 0.05 37 | max_interval: float = 0.25 38 | 39 | 40 | class PressKeyModel(BaseModel): 41 | key: str 42 | 43 | 44 | class useSecretRequest(BaseModel): 45 | token: str 46 | server_address: str 47 | name: str 48 | field: str 49 | 50 | class getSecretRequest(BaseModel): 51 | token: str 52 | server_address: str 53 | owner_id: str 54 | 55 | class PressKeysModel(BaseModel): 56 | keys: List[str] 57 | 58 | 59 | class ScrollModel(BaseModel): 60 | clicks: int = 3 61 | 62 | 63 | class DragMouseModel(BaseModel): 64 | x: int 65 | y: int 66 | 67 | 68 | class ScreenshotResponseModel(BaseModel): 69 | status: str 70 | images: List[str] 71 | 72 | 73 | class CoordinatesModel(BaseModel): 74 | x: int 75 | y: int 76 | 77 | 78 | class RecordRequest(BaseModel): 79 | description: Optional[str] = None 80 | task_id: Optional[str] = None 81 | skill_id: Optional[str] = None 82 | token: str 83 | server_address: str 84 | owner_id: str 85 | 86 | class StopRequest(BaseModel): 87 | result: Optional[str] = None 88 | comment: Optional[str] = None 89 | 90 | 91 | class RecordResponse(BaseModel): 92 | task_id: str 93 | 94 | 95 | class ClickData(BaseModel): 96 | button: str 97 | pressed: bool 98 | 99 | 100 | class KeyData(BaseModel): 101 | key: str 102 | 103 | 104 | class TextData(BaseModel): 105 | text: str 106 | 107 | 108 | class ScrollData(BaseModel): 109 | dx: int 110 | dy: int 111 | 112 | class ActionDetails(BaseModel): 113 | x: float 114 | y: float 115 | action: V1Action 116 | end_stamp: Optional[float] 117 | start_state: Optional[V1EnvState] 118 | event_order: int 119 | 120 | class RecordedEvent(BaseModel): 121 | id: str 122 | type: str 123 | timestamp: float 124 | coordinates: 
CoordinatesModel 125 | before_screenshot_path: Optional[str] = None 126 | after_screenshot_path: Optional[str] = None 127 | before_screenshot_b64: Optional[str] = None 128 | after_screenshot_b64: Optional[str] = None 129 | click_data: Optional[ClickData] = None 130 | key_data: Optional[KeyData] = None 131 | scroll_data: Optional[ScrollData] = None 132 | text_data: Optional[TextData] = None 133 | 134 | 135 | class Recording(BaseModel): 136 | id: str 137 | description: Optional[str] = None 138 | start_time: float 139 | end_time: float 140 | events: List[RecordedEvent] = [] 141 | task_id: str 142 | 143 | 144 | class Recordings(BaseModel): 145 | recordings: List[str] 146 | 147 | 148 | class Actions(BaseModel): 149 | actions: List[Dict[str, Any]] 150 | 151 | 152 | class SystemUsageModel(BaseModel): 153 | cpu_percent: float 154 | memory_percent: float 155 | disk_percent: float 156 | -------------------------------------------------------------------------------- /agentd/celery_worker.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | import requests 3 | from skillpacks import V1ActionEvent, ActionEvent 4 | from celery.app.task import Task 5 | from taskara.task import V1TaskUpdate, V1Task 6 | from taskara.task import Task as App_task 7 | Task.__class_getitem__ = classmethod(lambda cls, *args, **kwargs: cls) # type: ignore[attr-defined] 8 | 9 | 10 | 11 | # Create a new Celery application instance with Redis as the broker and a filesystem-backed result store 12 | celery_app = Celery('send_actions', broker='redis://localhost:6379/0', backend='file:///config/app/celery') 13 | 14 | celery_app.conf.update( 15 | worker_concurrency=1, # Keep concurrency at 1; raising it would require changing the data model or adding object locking.
16 | task_serializer='json', # Specify the task serializer if needed 17 | worker_max_memory_per_child=512000000 18 | ) 19 | 20 | 21 | @celery_app.task 22 | def send_action(taskID, auth_token, owner_id, v1Task: dict, v1actionEvent: dict): 23 | print("send_action: starting send action function in worker") 24 | action = ActionEvent.from_v1(V1ActionEvent(**v1actionEvent)) 25 | print(f"send_action: action {action.id} variable created in worker process") 26 | task = App_task.from_v1_remote_actions(V1Task(**v1Task), owner_id=owner_id, auth_token=auth_token) 27 | print(f"send_action: task {task.id} variable created in worker process") 28 | try: 29 | task.record_action_event(action) 30 | except Exception as e: 31 | print(f"send_action: record_action_event failed due to error: {e} for task ID: {taskID} and action {action.action.model_dump()} and event order {action.event_order}") 32 | print(f"send_action: finished sending action {action.id} for task {task.id}") 33 | return f"send_action: finished sending action {action.id} for task {task.id}" 34 | 35 | @celery_app.task 36 | def update_task(taskID, remote_address, auth_token, v1taskupdate: dict): 37 | print("update_task: starting update task function in worker") 38 | print(f"update_task: task {taskID} will be updated with {v1taskupdate}") 39 | # Ensure the v1taskupdate dictionary matches the Pydantic model 40 | try: 41 | updateData = V1TaskUpdate(**v1taskupdate) 42 | except Exception as e: 43 | print(f"update_task: Error while parsing update data: {e}") 44 | raise 45 | 46 | print(f"update_task: Task {taskID} update {updateData.model_dump()} created in worker process") 47 | 48 | headers = {} 49 | if auth_token: 50 | headers["Authorization"] = f"Bearer {auth_token}" 51 | else: 52 | print("update_task: Error: no auth token!!") 53 | 54 | url = f"{remote_address}/v1/tasks/{taskID}" 55 | print(f"update_task: url and headers: {url} {headers}") 56 | try: 57 | response = requests.put(url, json=updateData.model_dump(), headers=headers) 58 | try: 59 | response.raise_for_status() 60 | except requests.exceptions.HTTPError as e: 61 | print(f"update_task: HTTP Error: {e}") 62 | print(f"update_task: Status Code: {response.status_code}") 63 | try: 64 | print(f"update_task: Response Body: {response.json()}") 65 | except ValueError: 66 | print(f"update_task: Raw Response: {response.text}") 67 | raise 68 | print(f"update_task: response: {response.__dict__}") 69 | print(f"update_task: response.status_code: {response.status_code}") 70 | try: 71 | response_json = response.json() 72 | print(f"update_task: response_json: {response_json}") 73 | return response_json 74 | except ValueError: 75 | print(f"update_task: Raw Response: {response.text}") 76 | return None 77 | 78 | except requests.RequestException as e: 79 | print(f"update_task: Request failed: {e}") 80 | raise e -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | screenshots/ 2 | .recordings/ 3 | .DS_Store 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually
these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | 166 | .vms 167 | meta/user-data 168 | log 169 | output-* 170 | artifacts 171 | cidata.iso 172 | cidata_root.iso 173 | .data -------------------------------------------------------------------------------- /agentd/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pwd 3 | import subprocess 4 | import threading 5 | import queue 6 | 7 | 8 | def run_as_user(command, username): 9 | # Get the user's UID and GID 10 | pw_record = pwd.getpwnam(username) 11 | user_uid = pw_record.pw_uid 12 | user_gid = pw_record.pw_gid 13 | 14 | def preexec_fn(): 15 | os.setgid(user_gid) 16 | os.setuid(user_uid) 17 | 18 | return subprocess.Popen(command, preexec_fn=preexec_fn) 19 | 20 | def log_subprocess_output(pipe, sub_process): 21 | for line in iter(pipe.readline, b''): # b'\n'-separated lines 22 | if line: # Check if the line is not empty 23 | print(f'from subprocess: {sub_process} got line: {line.strip()}', flush=True) 24 | 25 | class OrderLock: 26 | """ 27 | A lock that ensures threads acquire the lock in FIFO (first-in, first-out) order 28 | using queue.Queue(). Each thread places an Event in the queue and waits for 29 | its Event to be set before proceeding to acquire the internal lock. 30 | 31 | This approach automates queue management, removing the need for manual 32 | Condition objects and notify/wait calls. 33 | 34 | Usage: 35 | order_lock = OrderLock() 36 | 37 | def worker(i): 38 | print(f"Worker {i} waiting for lock") 39 | with order_lock: 40 | print(f"Worker {i} acquired lock") 41 | time.sleep(1) 42 | print(f"Worker {i} released lock") 43 | 44 | threads = [threading.Thread(target=worker, args=(i,)) for i in range(5)] 45 | for t in threads: 46 | t.start() 47 | for t in threads: 48 | t.join() 49 | 50 | Behavior: 51 | 1. Thread enqueues a threading.Event (thread’s place in line). 52 | 2. If it’s the only event in the queue, it is immediately set. 53 | 3. The thread waits on the Event until it is set, then acquires the lock. 54 | 4. On release, the thread dequeues its own Event and sets the next Event 55 | in the queue (if any), transferring ownership of the lock to that thread. 56 | 57 | Note: 58 | - This enforces strict FIFO ordering. 59 | - If you don’t need ordering, a regular threading.Lock is simpler/faster. 60 | - If you need complex ordering (e.g., priority), you’ll need a more advanced approach. 
61 | """ 62 | 63 | def __init__(self): 64 | # Lock for the shared resource 65 | self._resource_lock = threading.Lock() 66 | # A queue of Event objects, one per waiting thread 67 | self._queue = queue.Queue() 68 | # Internal lock to ensure enqueue/dequeue operations are atomic 69 | self._queue_lock = threading.Lock() 70 | 71 | def acquire(self): 72 | """Acquire the lock in FIFO order.""" 73 | my_event = threading.Event() 74 | 75 | with self._queue_lock: 76 | self._queue.put(my_event) 77 | # If this is the only event in the queue, allow the thread to proceed 78 | if self._queue.qsize() == 1: 79 | my_event.set() 80 | 81 | # Block until my_event is set, meaning it's this thread's turn 82 | my_event.wait() 83 | self._resource_lock.acquire() 84 | 85 | def release(self): 86 | """Release the lock, notify the next waiting thread (if any).""" 87 | self._resource_lock.release() 88 | 89 | with self._queue_lock: 90 | # Remove this thread’s event from the queue 91 | finished_event = self._queue.get() 92 | # Optional: sanity check 93 | # assert finished_event.is_set() 94 | 95 | # If there is another thread waiting, set its event 96 | if not self._queue.empty(): 97 | next_event = self._queue.queue[0] # Peek at the next event 98 | next_event.set() 99 | 100 | def __enter__(self): 101 | self.acquire() 102 | return self 103 | 104 | def __exit__(self, exc_type, exc_val, exc_tb): 105 | self.release() 106 | -------------------------------------------------------------------------------- /theme/xfce4-panel.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $EUID -ne 0 ]]; then 4 | echo "This script must be run as root (or with sudo). Exiting..." 5 | exit 1 6 | fi 7 | 8 | echo "creating user..." 9 | adduser --disabled-password --gecos '' agentsea 10 | touch /home/agentsea/.bashrc 11 | touch /home/agentsea/.Xauthority 12 | chown -R agentsea:agentsea /home/agentsea 13 | echo 'agentsea ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/agentsea 14 | 15 | 16 | echo "Configuring .xprofile to disable screen saver..." 17 | cat > /home/agentsea/.xprofile < /home/agentsea/.config/lxqt/lxqt-powermanagement.conf 31 | chown -R agentsea:agentsea /home/agentsea/.config 32 | 33 | echo "installing base packages..." 
34 | add-apt-repository universe 35 | apt-get update 36 | apt-get install -y xvfb x11vnc websockify python3-pip python3-dev python3-venv python3-tk software-properties-common ntp dbus-x11 openbox menu lxqt sddm lxqt-session wmctrl xdotool 37 | apt-get remove -y xscreensaver 38 | 39 | echo 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1001/bus"' >> /home/agentsea/.profile 40 | 41 | echo "installing chromium" 42 | snap install chromium 43 | update-alternatives --install /usr/bin/x-www-browser x-www-browser /snap/bin/chromium 200 44 | update-alternatives --set x-www-browser /snap/bin/chromium 45 | 46 | echo "configuring lxqt" 47 | mkdir -p /etc/sddm.conf.d 48 | echo "[Autologin]" > /etc/sddm.conf.d/autologin.conf 49 | echo "User=agentsea" >> /etc/sddm.conf.d/autologin.conf 50 | echo "Session=lxqt.desktop" >> /etc/sddm.conf.d/autologin.conf 51 | 52 | 53 | echo -e "[Session]\nwindow_manager=openbox" > /home/agentsea/.config/lxqt/session.conf 54 | 55 | mkdir -p /home/agentsea/.config/openbox 56 | cp /etc/xdg/openbox/rc.xml /home/agentsea/.config/openbox/ 57 | 58 | chown -R agentsea:agentsea /home/agentsea/.config 59 | 60 | su agentsea -c "xauth generate :99 . trusted" 61 | su agentsea -c "bash install_deps.sh" 62 | 63 | # Disable screen saver and DPMS 64 | echo "Disabling screen saver and DPMS..." 65 | su agentsea -c "xset s off" 66 | # su agentsea -c "xset -dpms" 67 | 68 | echo "copying services..." 69 | cp ./conf/agentd.service /etc/systemd/system/agentd.service 70 | cp ./conf/websockify.service /etc/systemd/system/websockify.service 71 | cp ./conf/x11vnc.service /lib/systemd/system/x11vnc.service 72 | cp ./conf/xvfb.service /lib/systemd/system/xvfb.service 73 | cp ./conf/openbox.service /lib/systemd/system/openbox.service 74 | cp ./conf/lxqt.service /lib/systemd/system/lxqt.service 75 | 76 | loginctl enable-linger agentsea 77 | 78 | echo "enabling services..." 79 | systemctl daemon-reload 80 | systemctl enable agentd.service 81 | systemctl enable websockify.service 82 | systemctl enable x11vnc.service 83 | systemctl enable xvfb.service 84 | systemctl enable openbox.service 85 | systemctl enable lxqt.service 86 | systemctl enable ntp 87 | 88 | restart_service_and_log() { 89 | local service_name="$1" 90 | echo "Restarting $service_name..." 91 | if systemctl restart "$service_name"; then 92 | echo "$service_name restarted successfully." 93 | else 94 | echo "Failed to restart $service_name. Here are the last 20 log lines:" 95 | journalctl -u "$service_name" --no-pager -n 20 96 | fi 97 | } 98 | 99 | echo "restarting services..." 100 | restart_service_and_log agentd.service 101 | restart_service_and_log websockify.service 102 | restart_service_and_log x11vnc.service 103 | restart_service_and_log xvfb.service 104 | restart_service_and_log openbox.service 105 | restart_service_and_log lxqt.service 106 | restart_service_and_log ntp 107 | 108 | echo "disabling firewall..." 109 | ufw disable 110 | 111 | su - agentsea -c 'bash -l -c " 112 | while [ -z \$(pgrep -u agentsea lxqt-session) ]; do 113 | echo Waiting for LXQt session to start... 114 | sleep 2 115 | done 116 | 117 | echo LXQt session started, setting icon as trusted... 118 | "' 119 | 120 | 121 | echo "Adding Firefox icon to desktop..." 
122 | mkdir -p /home/agentsea/Desktop 123 | cat > /home/agentsea/Desktop/firefox.desktop < 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /base.pkr.hcl: -------------------------------------------------------------------------------- 1 | packer { 2 | required_plugins { 3 | googlecompute = { 4 | source = "github.com/hashicorp/googlecompute" 5 | version = "~> 1" 6 | } 7 | } 8 | } 9 | 10 | packer { 11 | required_plugins { 12 | amazon = { 13 | source = "github.com/hashicorp/amazon" 14 | version = "~> 1" 15 | } 16 | } 17 | } 18 | variable "build_qemu" { 19 | type = bool 20 | default = true 21 | } 22 | 23 | variable "build_ec2" { 24 | type = bool 25 | default = true 26 | } 27 | 28 | variable "build_gce" { 29 | type = bool 30 | default = true 31 | } 32 | 33 | variable "gcp_project_id" { 34 | type = string 35 | default = "your-gcp-project-id" 36 | } 37 | 38 | variable "aws_region" { 39 | type = string 40 | default = "your-aws-region" 41 | } 42 | 43 | variable "output_directory" { 44 | type = string 45 | default = "output-ubuntu" 46 | } 47 | 48 | variable "cpu" { 49 | type = string 50 | default = "2" 51 | } 52 | 53 | variable "disk_size" { 54 | type = string 55 | default = "40000" 56 | } 57 | 58 | variable "headless" { 59 | type = string 60 | default = "true" 61 | } 62 | 63 | variable "iso_checksum" { 64 | type = string 65 | default = "d277aaac7a56ec02ea026a02d92fde2fc358048431749cb1031b62380cc93584" 66 | } 67 | 68 | variable "iso_url" { 69 | type = string 70 | default = "https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img" 71 | } 72 | 73 | variable "name" { 74 | type = string 75 | default = "jammy" 76 | } 77 | 78 | variable "ram" { 79 | type = string 80 | default = "2048" 81 | } 82 | 83 | variable "ssh_password" { 84 | type = string 85 | default = "ubuntu" 86 | } 87 | 88 | variable "ssh_username" { 89 | type = string 90 | default = "ubuntu" 91 | } 92 | 93 | variable "version" { 94 | type = string 95 | default = "" 96 | } 97 | 98 | variable "format" { 99 | type = string 100 | default = "qcow2" 101 | } 102 | 103 | source "qemu" "jammy" { 104 | # accelerator = "kvm" 105 | boot_command = [] 106 | disk_compression = true 107 | disk_interface = "virtio" 108 | disk_image = true 109 | disk_size = var.disk_size 110 | format = var.format 111 | headless = var.headless 112 | iso_checksum = var.iso_checksum 113 | iso_url = var.iso_url 114 | net_device = "virtio-net" 115 | output_directory = "${var.output_directory}" 116 | qemuargs = [ 117 | ["-m", "${var.ram}M"], 118 | ["-smp", "${var.cpu}"], 119 | ["-cdrom", "cidata_root.iso"] 120 | ] 121 | communicator = "ssh" 122 | shutdown_command = "echo '${var.ssh_password}' | sudo -S shutdown -P now" 123 | ssh_password = var.ssh_password 124 | ssh_username = var.ssh_username 125 | ssh_timeout = "10m" 126 | } 127 | 128 | source "amazon-ebs" "jammy" { 129 | ami_name = "agentd-ubuntu-22.04-${formatdate("YYYYMMDDHHmmss", timestamp())}" 130 | instance_type = "t2.micro" 131 | region = var.aws_region 132 | source_ami_filter { 133 | filters = { 134 | name = 
"ubuntu/images/*ubuntu-jammy-22.04-amd64-server-*" 135 | root-device-type = "ebs" 136 | virtualization-type = "hvm" 137 | } 138 | owners = ["099720109477"] # Ubuntu's owner ID 139 | most_recent = true 140 | } 141 | ssh_username = "ubuntu" 142 | } 143 | 144 | source "googlecompute" "ubuntu" { 145 | project_id = var.gcp_project_id 146 | source_image_family = "ubuntu-2204-lts" 147 | zone = "us-central1-a" 148 | ssh_username = "ubuntu" 149 | image_name = "agentd-ubuntu-22-04-${formatdate("YYYYMMDDHHmmss", timestamp())}" 150 | } 151 | 152 | build { 153 | // dynamic "source" { 154 | // for_each = var.build_qemu ? ["source.qemu.jammy"] : [] 155 | // content { 156 | // source = source.value 157 | // } 158 | // } 159 | 160 | // dynamic "source" { 161 | // for_each = var.build_ec2 ? ["source.amazon-ebs.jammy"] : [] 162 | // content { 163 | // source = source.value 164 | // } 165 | // } 166 | 167 | // dynamic "source" { 168 | // for_each = var.build_gce ? ["source.googlecompute.ubuntu"] : [] 169 | // content { 170 | // source = source.value 171 | // } 172 | // } 173 | sources = [ 174 | "source.qemu.jammy", 175 | "source.amazon-ebs.jammy", 176 | "source.googlecompute.ubuntu", 177 | ] 178 | 179 | 180 | provisioner "shell" { 181 | inline = [ 182 | # Run install script 183 | "curl -sSL https://raw.githubusercontent.com/agentsea/agentd/main/remote_install.sh | sudo bash", 184 | 185 | # Prepare cloud-init to run on next boot for the QEMU image 186 | "sudo cloud-init clean --logs", 187 | "sudo truncate -s 0 /etc/machine-id", 188 | "sudo rm /var/lib/dbus/machine-id", 189 | "sudo ln -s /etc/machine-id /var/lib/dbus/machine-id", 190 | 191 | # Disable SSH password authentication 192 | "sudo sed -i 's/^#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config", 193 | "sudo sed -i 's/^PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config", 194 | "sudo systemctl restart sshd", 195 | ] 196 | } 197 | 198 | // post-processor "amazon-ami" { 199 | // region = var.aws_region 200 | // ami_users = ["all"] 201 | // only = ["source.amazon-ebs.jammy"] 202 | // } 203 | 204 | // post-processor "shell-local" { 205 | // inline = [ 206 | // "gcloud compute images add-iam-policy-binding ${build.ImageName} --member='allAuthenticatedUsers' --role='roles/compute.imageUser'", 207 | // ] 208 | // only = ["source.googlecompute.ubuntu"] 209 | // } 210 | 211 | // post-processor "shell-local" { 212 | // only = ["source.qemu.jammy"] 213 | // inline = [ 214 | // "echo \"copying artifacts to local latest directory...\"", 215 | // "mkdir -p \"${BASE_DIR}/latest\"", 216 | // "cp \"${OUTPUT_DIRECTORY}/packer-jammy\" \"${BASE_DIR}/latest/jammy.qcow2\"", 217 | // "echo 'copying artifacts to GCS...'", 218 | // "TIMESTAMP=$(date +%Y%m%d%H%M%S)", 219 | // "OUTPUT_DIR='output-ubuntu'", 220 | // // Commands for copying artifacts to GCS commented out for clarity 221 | // "gsutil cp \"gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2\" \"gs://agentsea-vms/jammy/${TIMESTAMP}/agentd-jammy.qcow2\"", 222 | // "gsutil acl ch -u AllUsers:R \"gs://agentsea-vms/jammy/${TIMESTAMP}/agentd-jammy.qcow2\"", 223 | // ] 224 | // } 225 | } 226 | -------------------------------------------------------------------------------- /server.pkr.hcl: -------------------------------------------------------------------------------- 1 | packer { 2 | required_plugins { 3 | googlecompute = { 4 | source = "github.com/hashicorp/googlecompute" 5 | version = "~> 1" 6 | } 7 | } 8 | } 9 | 10 | packer { 11 | required_plugins { 12 | amazon = { 13 | 
source = "github.com/hashicorp/amazon" 14 | version = "~> 1" 15 | } 16 | } 17 | } 18 | variable "build_qemu" { 19 | type = bool 20 | default = true 21 | } 22 | 23 | variable "build_ec2" { 24 | type = bool 25 | default = true 26 | } 27 | 28 | variable "build_gce" { 29 | type = bool 30 | default = true 31 | } 32 | 33 | variable "gcp_project_id" { 34 | type = string 35 | default = "your-gcp-project-id" 36 | } 37 | 38 | variable "aws_region" { 39 | type = string 40 | default = "your-aws-region" 41 | } 42 | 43 | variable "output_directory" { 44 | type = string 45 | default = "output-ubuntu" 46 | } 47 | 48 | variable "cpu" { 49 | type = string 50 | default = "2" 51 | } 52 | 53 | variable "disk_size" { 54 | type = string 55 | default = "40000" 56 | } 57 | 58 | variable "headless" { 59 | type = string 60 | default = "true" 61 | } 62 | 63 | variable "iso_checksum" { 64 | type = string 65 | default = "d277aaac7a56ec02ea026a02d92fde2fc358048431749cb1031b62380cc93584" 66 | } 67 | 68 | variable "iso_url" { 69 | type = string 70 | default = "https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img" 71 | } 72 | 73 | variable "name" { 74 | type = string 75 | default = "jammy" 76 | } 77 | 78 | variable "ram" { 79 | type = string 80 | default = "2048" 81 | } 82 | 83 | variable "ssh_password" { 84 | type = string 85 | default = "ubuntu" 86 | } 87 | 88 | variable "ssh_username" { 89 | type = string 90 | default = "ubuntu" 91 | } 92 | 93 | variable "version" { 94 | type = string 95 | default = "" 96 | } 97 | 98 | variable "format" { 99 | type = string 100 | default = "qcow2" 101 | } 102 | 103 | // source "qemu" "jammy" { 104 | // # accelerator = "kvm" 105 | // boot_command = [] 106 | // disk_compression = true 107 | // disk_interface = "virtio" 108 | // disk_image = true 109 | // disk_size = var.disk_size 110 | // format = var.format 111 | // headless = var.headless 112 | // iso_checksum = var.iso_checksum 113 | // iso_url = var.iso_url 114 | // net_device = "virtio-net" 115 | // output_directory = "${var.output_directory}" 116 | // qemuargs = [ 117 | // ["-m", "${var.ram}M"], 118 | // ["-smp", "${var.cpu}"], 119 | // ["-cdrom", "cidata_root.iso"] 120 | // ] 121 | // communicator = "ssh" 122 | // shutdown_command = "echo '${var.ssh_password}' | sudo -S shutdown -P now" 123 | // ssh_password = var.ssh_password 124 | // ssh_username = var.ssh_username 125 | // ssh_timeout = "10m" 126 | // } 127 | 128 | // source "amazon-ebs" "jammy" { 129 | // ami_name = "agentd-ubuntu-22.04-${formatdate("YYYYMMDDHHmmss", timestamp())}" 130 | // instance_type = "t2.micro" 131 | // region = var.aws_region 132 | // source_ami_filter { 133 | // filters = { 134 | // name = "ubuntu/images/*ubuntu-jammy-22.04-amd64-server-*" 135 | // root-device-type = "ebs" 136 | // virtualization-type = "hvm" 137 | // } 138 | // owners = ["099720109477"] # Ubuntu's owner ID 139 | // most_recent = true 140 | // } 141 | // ssh_username = "ubuntu" 142 | // } 143 | 144 | source "googlecompute" "ubuntu" { 145 | project_id = var.gcp_project_id 146 | source_image = "agentd-ubuntu-22-04-u20240530022848" 147 | zone = "us-central1-a" 148 | ssh_username = "ubuntu" 149 | image_name = "agentd-ubuntu-22-04-u${formatdate("YYYYMMDDHHmmss", timestamp())}" 150 | } 151 | 152 | build { 153 | // dynamic "source" { 154 | // for_each = var.build_qemu ? ["source.qemu.jammy"] : [] 155 | // content { 156 | // source = source.value 157 | // } 158 | // } 159 | 160 | // dynamic "source" { 161 | // for_each = var.build_ec2 ? 
["source.amazon-ebs.jammy"] : [] 162 | // content { 163 | // source = source.value 164 | // } 165 | // } 166 | 167 | // dynamic "source" { 168 | // for_each = var.build_gce ? ["source.googlecompute.ubuntu"] : [] 169 | // content { 170 | // source = source.value 171 | // } 172 | // } 173 | 174 | sources = [ 175 | // "source.qemu.jammy", 176 | // "source.amazon-ebs.jammy", 177 | "source.googlecompute.ubuntu", 178 | ] 179 | 180 | 181 | provisioner "shell" { 182 | inline = [ 183 | # Run install script 184 | "curl -sSL https://raw.githubusercontent.com/agentsea/agentd/main/remote_install_server.sh | sudo bash", 185 | 186 | # Prepare cloud-init to run on next boot for the QEMU image 187 | "sudo cloud-init clean --logs", 188 | "sudo truncate -s 0 /etc/machine-id", 189 | "sudo rm /var/lib/dbus/machine-id", 190 | "sudo ln -s /etc/machine-id /var/lib/dbus/machine-id", 191 | 192 | # Disable SSH password authentication 193 | "sudo sed -i 's/^#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config", 194 | "sudo sed -i 's/^PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config", 195 | "sudo systemctl restart sshd", 196 | ] 197 | } 198 | // post-processor "amazon-ami" { 199 | // region = var.aws_region 200 | // ami_users = ["all"] 201 | // only = ["source.amazon-ebs.jammy"] 202 | // } 203 | 204 | // post-processor "shell-local" { 205 | // inline = [ 206 | // "gcloud compute images add-iam-policy-binding ${build.ImageName} --member='allAuthenticatedUsers' --role='roles/compute.imageUser'", 207 | // ] 208 | // only = ["source.googlecompute.ubuntu"] 209 | // } 210 | 211 | // post-processor "shell-local" { 212 | // only = ["source.qemu.jammy"] 213 | // inline = [ 214 | // "echo \"copying artifacts to local latest directory...\"", 215 | // "mkdir -p \"${BASE_DIR}/latest\"", 216 | // "cp \"${OUTPUT_DIRECTORY}/packer-jammy\" \"${BASE_DIR}/latest/jammy.qcow2\"", 217 | // "echo 'copying artifacts to GCS...'", 218 | // "TIMESTAMP=$(date +%Y%m%d%H%M%S)", 219 | // "OUTPUT_DIR='output-ubuntu'", 220 | // // Commands for copying artifacts to GCS commented out for clarity 221 | // "gsutil cp \"gs://agentsea-vms/jammy/latest/agentd-jammy.qcow2\" \"gs://agentsea-vms/jammy/${TIMESTAMP}/agentd-jammy.qcow2\"", 222 | // "gsutil acl ch -u AllUsers:R \"gs://agentsea-vms/jammy/${TIMESTAMP}/agentd-jammy.qcow2\"", 223 | // ] 224 | // } 225 | } 226 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | # agentd
4 |
5 | A daemon that makes a desktop OS accessible to AI agents.
6 |
7 | Explore the docs » · View Demo · Report Bug · Request Feature
24 | 25 | `AgentD` makes a desktop OS accessible to AI agents by exposing an HTTP API. 26 | 27 | For a higher level interface see [AgentDesk](https://github.com/agentsea/agentdesk). 28 | 29 | ## Usage 30 | 31 | `AgentD` is currently tested on Ubuntu 22.04 cloud image. 32 | 33 | We recommend using one of our base vms which is already configured. 34 | 35 | ### Qemu 36 | 37 | For Qemu, download the qcow2 image: 38 | ```bash 39 | wget https://storage.googleapis.com/agentsea-vms/jammy/latest/agentd-jammy.qcow2 40 | ``` 41 | 42 | To use the image, we need to make a [cloud-init](https://cloud-init.io/) iso with our user-data. See this [tutorial](https://cloudinit.readthedocs.io/en/latest/reference/datasources/nocloud.html), below is how it looks on MacOS: 43 | 44 | ```bash 45 | xorriso -as mkisofs -o cidata.iso -V "cidata" -J -r -iso-level 3 meta/ 46 | ``` 47 | Then the image can be ran with Qemu: 48 | 49 | ```bash 50 | qemu-system-x86_64 -nographic -hda ./agentd-jammy.qcow2 \ 51 | -m 4G -smp 2 -netdev user,id=vmnet,hostfwd=tcp::6080-:6080,hostfwd=tcp::8000-:8000,hostfwd=tcp::2222-:22 \ 52 | -device e1000,netdev=vmnet -cdrom cidata.iso 53 | ``` 54 | Once running, the agentd service can be accessed: 55 | 56 | ```bash 57 | curl localhost:8000/health 58 | ``` 59 | To login to the machine: 60 | 61 | ```bash 62 | ssh -p 2222 agentsea@localhost 63 | ``` 64 | 65 | ### AWS 66 | For AWS, use public AMI `ami-01a893c1530453073`. 67 | 68 | Create a cloud-init script with your ssh key: 69 | 70 | ```yaml 71 | #cloud-config 72 | 73 | users: 74 | - name: agentsea 75 | sudo: ['ALL=(ALL) NOPASSWD:ALL'] 76 | groups: sudo 77 | ssh_authorized_keys: 78 | - your-ssh-public-key 79 | 80 | package_upgrade: true 81 | ``` 82 | 83 | ```bash 84 | aws ec2 run-instances \ 85 | --image-id ami-01a893c1530453073 \ 86 | --count 1 \ 87 | --instance-type t2.micro \ 88 | --key-name $KEY_NAME \ 89 | --security-group-ids $SG_NAME \ 90 | --subnet-id $SUBNET_NAME \ 91 | --user-data file://path/to/cloud-init-config.yaml 92 | ``` 93 | 94 | ### GCE 95 | 96 | For GCE, use the public image `ubuntu-22-04-20240208044623`. 97 | 98 | ```bash 99 | gcloud compute instances create $NAME \ 100 | --machine-type "n1-standard-1" \ 101 | --image "ubuntu-22-04-20240208044623" \ 102 | --image-project $PROJECT_ID \ 103 | --zone $ZONE \ 104 | --metadata ssh-keys="agentsea:$(cat path/to/your/public/ssh/key.pub)" 105 | ``` 106 | 107 | ### Custom 108 | 109 | If you want to install on a fresh Ubuntu VM, use the a [cloud images base](https://cloud-images.ubuntu.com/jammy/current/) qcow2 image. 110 | 111 | ```bash 112 | curl -sSL https://raw.githubusercontent.com/agentsea/agentd/main/remote_install.sh | sudo bash 113 | ``` 114 | 115 | ## API Endpoints 116 | 117 | ### General 118 | 119 | - **GET /health** - Checks the API's health. 120 | - **Response:** `{"status": "ok"}` 121 | 122 | ### Mouse and Keyboard Control 123 | 124 | - **GET /mouse_coordinates** - Retrieves the current mouse coordinates. 125 | 126 | - **Response Model:** `CoordinatesModel` 127 | 128 | - **POST /move_mouse** - Moves the mouse to specified coordinates. 129 | 130 | - **Request Body:** `MoveMouseModel` 131 | - **Response:** `{"status": "success"}` or `{"status": "error", "message": ""}` 132 | 133 | - **POST /click** - Clicks at the current or specified location. 134 | 135 | - **Request Body:** `ClickModel` 136 | - **Response:** `{"status": "success"}` or raises `HTTPException` 137 | 138 | - **POST /double_click** - Performs a double-click at the current mouse location. 
139 | 140 | - **Response:** `{"status": "success"}` or raises `HTTPException` 141 | 142 | - **POST /type_text** - Types the specified text. 143 | 144 | - **Request Body:** `TypeTextModel` 145 | - **Response:** `{"status": "success"}` or raises `HTTPException` 146 | 147 | - **POST /press_key** - Presses a specified key. 148 | 149 | - **Request Body:** `PressKeyModel` 150 | - **Response:** `{"status": "success"}` or raises `HTTPException` 151 | 152 | - **POST /scroll** - Scrolls the mouse wheel. 153 | 154 | - **Request Body:** `ScrollModel` 155 | - **Response:** `{"status": "success"}` or raises `HTTPException` 156 | 157 | - **POST /drag_mouse** - Drags the mouse to specified coordinates. 158 | - **Request Body:** `DragMouseModel` 159 | - **Response:** `{"status": "success"}` or raises `HTTPException` 160 | 161 | ### Web Browser Control 162 | 163 | - **POST /open_url** - Opens a URL in a Chromium-based browser. 164 | - **Request Body:** `OpenURLModel` 165 | - **Response:** `{"status": "success"}` or `{"status": "error", "message": ""}` 166 | 167 | ### Screen Capture 168 | 169 | - **POST /screenshot** - Takes a screenshot and returns it as a base64-encoded image. 170 | - **Response Model:** `ScreenshotResponseModel` 171 | 172 | ### Session Recording 173 | 174 | - **POST /recordings** - Starts a new recording session. 175 | 176 | - **Request Body:** `RecordRequest` 177 | - **Response Model:** `RecordResponse` 178 | 179 | - **GET /recordings** - Lists all recordings. 180 | 181 | - **Response Model:** `Recordings` 182 | 183 | - **POST /recordings/{session_id}/stop** - Stops a recording session. 184 | 185 | - **Path Variable:** `session_id` 186 | - **Response:** None (side effect: stops recording and saves to file) 187 | 188 | - **GET /recordings/{session_id}** - Retrieves information about a specific recording session. 189 | 190 | - **Path Variable:** `session_id` 191 | - **Response Model:** `Recording` 192 | 193 | - **GET /recordings/{session_id}/event/{event_id}** - Retrieves a specific event from a recording. 194 | 195 | - **Path Variables:** `session_id`, `event_id` 196 | - **Response Model:** `RecordedEvent` 197 | 198 | - **DELETE /recordings/{session_id}/event/{event_id}** - Deletes a specific event from a recording. 199 | 200 | - **Path Variables:** `session_id`, `event_id` 201 | - **Response Model:** `Recording` 202 | 203 | - **GET /active_sessions** - Lists IDs of all active recording sessions. 204 | 205 | - **Response Model:** `Recordings` 206 | 207 | - **GET /recordings/{session_id}/actions** - Retrieves all actions from a specific recording session. 208 | - **Path Variable:** `session_id` 209 | - **Response Model:** `Actions` 210 | 211 | ## Community 212 | 213 | Come join us on [Discord](https://discord.gg/hhaq7XYPS6). 214 | 215 | ## Developing 216 | 217 | To pack a fresh set of images 218 | 219 | ```bash 220 | make pack 221 | ``` 222 |   223 | To run from this repo 224 | 225 | ```bash 226 | make run-jammy 227 | ``` 228 | -------------------------------------------------------------------------------- /docs/recordings.rst: -------------------------------------------------------------------------------- 1 | Making Recordings 2 | ================== 3 | 4 | POST /recordings 5 | ^^^^^^^^^^^^^^^^ 6 | 7 | The ``/recordings`` endpoint starts a new recording session. 8 | 9 | **Request:** 10 | 11 | .. code-block:: json 12 | 13 | { 14 | "description": "string" 15 | } 16 | 17 | **Response:** 18 | 19 | Returns a JSON response containing the session ID of the newly started recording session. 
20 | 21 | .. code-block:: json 22 | 23 | { 24 | "session_id": "uuid" 25 | } 26 | 27 | GET /recordings 28 | ^^^^^^^^^^^^^^^ 29 | 30 | The ``/recordings`` endpoint retrieves a list of all recording sessions. 31 | 32 | **Request:** 33 | 34 | No parameters required. 35 | 36 | **Response:** 37 | 38 | Returns a JSON response containing a list of recording session IDs. 39 | 40 | .. code-block:: json 41 | 42 | { 43 | "recordings": [ 44 | "uuid1", 45 | "uuid2", 46 | "uuid3" 47 | ] 48 | } 49 | 50 | This endpoint allows you to retrieve all the recording sessions that have been initiated. 51 | 52 | POST /recordings/{session_id}/stop 53 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 54 | 55 | The endpoint to stop a recording session. 56 | 57 | **Request:** 58 | 59 | Path Parameters: 60 | - ``session_id``: The unique identifier of the recording session to be stopped. 61 | 62 | **Response:** 63 | 64 | Returns a JSON response indicating the success of the operation. 65 | 66 | GET /recordings/{session_id} 67 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 68 | 69 | The endpoint to retrieve a specific recording session by its session ID. 70 | 71 | **Request:** 72 | 73 | Path Parameters: 74 | - ``session_id``: The unique identifier of the recording session to be retrieved. 75 | 76 | **Response:** 77 | 78 | Returns a JSON response containing the details of the specified recording session, including the session ID, description, start time, end time, and a list of recorded events. 79 | 80 | .. code-block:: json 81 | 82 | { 83 | "id": "uuid", 84 | "description": "Session Description", 85 | "start_time": 1622547600, 86 | "end_time": 1622547900, 87 | "events": [ 88 | { 89 | "id": "uuid", 90 | "type": "click", 91 | "timestamp": 1622547605, 92 | "coordinates": { 93 | "x": 100, 94 | "y": 200 95 | }, 96 | "screenshot_path": "path/to/screenshot", 97 | "click_data": { 98 | "button": "left", 99 | "pressed": true 100 | } 101 | }, 102 | { 103 | "id": "uuid", 104 | "type": "key", 105 | "timestamp": 1622547610, 106 | "key_data": { 107 | "key": "a" 108 | } 109 | } 110 | ] 111 | } 112 | 113 | This endpoint allows you to retrieve detailed information about a specific recording session, including all the events that occurred during the session. 114 | 115 | GET /recordings/{session_id}/event/{event_id} 116 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 117 | 118 | The endpoint to retrieve a specific event from a recording session by its session ID and event ID. 119 | 120 | **Request:** 121 | 122 | Path Parameters: 123 | - ``session_id``: The unique identifier of the recording session. 124 | - ``event_id``: The unique identifier of the event within the recording session. 125 | 126 | **Response:** 127 | 128 | Returns a JSON response containing the details of the specified event, including the event ID, type, timestamp, coordinates, and any associated data such as click data, key data, scroll data, or text data. 129 | 130 | .. code-block:: json 131 | 132 | { 133 | "id": "uuid", 134 | "type": "click", 135 | "timestamp": 1622547605, 136 | "coordinates": { 137 | "x": 100, 138 | "y": 200 139 | }, 140 | "screenshot_path": "path/to/screenshot", 141 | "click_data": { 142 | "button": "left", 143 | "pressed": true 144 | } 145 | } 146 | 147 | This endpoint allows you to retrieve detailed information about a specific event within a recording session. 
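As a quick check from the shell, you can pull a single event with ``curl``. This is a minimal sketch: it assumes the daemon is reachable on ``localhost:8000`` (the port forwarded in the README's QEMU example), and ``SESSION_ID``/``EVENT_ID`` are placeholders for values returned by earlier calls.

.. code-block:: bash

   # Fetch one recorded event from a session (IDs are placeholders)
   curl http://localhost:8000/recordings/$SESSION_ID/event/$EVENT_ID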
148 | 149 | DELETE /recordings/{session_id}/event/{event_id} 150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 151 | 152 | The endpoint to delete a specific event from a recording session by its session ID and event ID. 153 | 154 | **Request:** 155 | 156 | Path Parameters: 157 | - ``session_id``: The unique identifier of the recording session. 158 | - ``event_id``: The unique identifier of the event within the recording session. 159 | 160 | **Response:** 161 | 162 | Returns a JSON response containing the updated recording session details without the deleted event. 163 | 164 | .. code-block:: json 165 | 166 | { 167 | "id": "session_uuid", 168 | "description": "Session Description", 169 | "start_time": 1622547600, 170 | "end_time": 1622547615, 171 | "events": [ 172 | { 173 | "id": "uuid", 174 | "type": "click", 175 | "timestamp": 1622547605, 176 | "coordinates": { 177 | "x": 100, 178 | "y": 200 179 | }, 180 | "screenshot_path": "path/to/screenshot", 181 | "click_data": { 182 | "button": "left", 183 | "pressed": true 184 | } 185 | } 186 | // Other events 187 | ] 188 | } 189 | 190 | This endpoint allows you to delete a specific event from a recording session. 191 | 192 | GET /active_sessions 193 | ^^^^^^^^^^^^^^^^^^^^ 194 | 195 | This endpoint lists all active recording sessions. 196 | 197 | **Response:** 198 | 199 | Returns a JSON response containing a list of session IDs for all active recording sessions. 200 | 201 | .. code-block:: json 202 | 203 | { 204 | "recordings": [ 205 | "session_id_1", 206 | "session_id_2", 207 | // Other session IDs 208 | ] 209 | } 210 | 211 | This endpoint allows you to retrieve a list of all active recording sessions. 212 | 213 | GET /recordings/{session_id}/actions 214 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 215 | 216 | This endpoint retrieves a list of actions for a specific recording session. 217 | 218 | **Parameters:** 219 | 220 | - ``session_id``: The unique identifier for the recording session. 221 | 222 | **Response:** 223 | 224 | Returns a JSON response containing a list of actions for the specified recording session. 225 | 226 | .. code-block:: json 227 | 228 | { 229 | "actions": [ 230 | { 231 | "id": "action_uuid", 232 | "type": "click", 233 | "timestamp": 1622547605, 234 | "details": { 235 | "coordinates": { 236 | "x": 100, 237 | "y": 200 238 | }, 239 | "button": "left", 240 | "pressed": true 241 | } 242 | }, 243 | { 244 | "id": "action_uuid", 245 | "type": "keypress", 246 | "timestamp": 1622547610, 247 | "details": { 248 | "key": "space" 249 | } 250 | } 251 | // Other actions 252 | ] 253 | } 254 | 255 | This endpoint allows you to retrieve a list of all actions (clicks, keypresses, etc.) that occurred during a specific recording session. 
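Putting the endpoints together, a minimal recording round-trip from the shell could look like the sketch below. It follows the request and response shapes documented above, assumes the daemon is on ``localhost:8000``, and uses ``jq`` (not part of agentd) to extract the session ID.

.. code-block:: bash

   # Start a recording session and capture its ID
   SESSION_ID=$(curl -s -X POST http://localhost:8000/recordings \
     -H "Content-Type: application/json" \
     -d '{"description": "demo run"}' | jq -r .session_id)

   # ... perform some mouse/keyboard actions ...

   # Stop the session, then list the actions it captured
   curl -X POST http://localhost:8000/recordings/$SESSION_ID/stop
   curl http://localhost:8000/recordings/$SESSION_ID/actions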
256 | 257 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | from httpx import AsyncClient 2 | from unittest.mock import patch 3 | import pytest 4 | from agentd.server import app 5 | from agentd.recording import RecordingSession 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_root(): 10 | async with AsyncClient(app=app, base_url="http://test") as ac: 11 | response = await ac.get("/") 12 | assert response.status_code == 200 13 | assert response.json() == {"message": "Agent in the shell"} 14 | 15 | 16 | @pytest.mark.asyncio 17 | async def test_health(): 18 | async with AsyncClient(app=app, base_url="http://test") as ac: 19 | response = await ac.get("/health") 20 | assert response.status_code == 200 21 | assert response.json() == {"status": "ok"} 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_info(): 26 | async with AsyncClient(app=app, base_url="http://test") as ac: 27 | response = await ac.get("/info") 28 | assert response.status_code == 200 29 | assert "last_activity_ts" in response.json() 30 | assert "screen_size" in response.json() 31 | assert "os_info" in response.json() 32 | assert "code_version" in response.json() 33 | 34 | 35 | @pytest.mark.asyncio 36 | async def test_screen_size(): 37 | async with AsyncClient(app=app, base_url="http://test") as ac: 38 | response = await ac.get("/screen_size") 39 | assert response.status_code == 200 40 | assert "x" in response.json() 41 | assert "y" in response.json() 42 | 43 | 44 | @pytest.mark.asyncio 45 | async def test_mouse_coordinates(): 46 | async with AsyncClient(app=app, base_url="http://test") as ac: 47 | response = await ac.get("/mouse_coordinates") 48 | assert response.status_code == 200 49 | assert "x" in response.json() 50 | assert "y" in response.json() 51 | 52 | 53 | @pytest.mark.asyncio 54 | async def test_system_usage(): 55 | async with AsyncClient(app=app, base_url="http://test") as ac: 56 | response = await ac.get("/system_usage") 57 | assert response.status_code == 200 58 | assert "cpu_percent" in response.json() 59 | assert "memory_percent" in response.json() 60 | assert "disk_percent" in response.json() 61 | 62 | 63 | @pytest.mark.asyncio 64 | async def test_open_url(): 65 | with patch("agentd.server.is_chromium_running", return_value=False), patch( 66 | "agentd.server.gracefully_terminate_chromium" 67 | ) as mock_terminate, patch( 68 | "agentd.server.is_chromium_window_open", return_value=True 69 | ), patch( 70 | "agentd.server.subprocess.Popen" 71 | ) as mock_popen: 72 | async with AsyncClient(app=app, base_url="http://test") as ac: 73 | response = await ac.post("/open_url", json={"url": "http://example.com"}) 74 | 75 | assert response.status_code == 200 76 | assert response.json() == {"status": "success"} 77 | mock_terminate.assert_not_called() 78 | mock_popen.assert_called_once() 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_move_mouse(): 83 | async with AsyncClient(app=app, base_url="http://test") as ac: 84 | response = await ac.post( 85 | "/move_mouse", json={"x": 100, "y": 200, "duration": 1.0, "tween": "linear"} 86 | ) 87 | assert response.status_code == 200 88 | assert response.json() == {"status": "success"} 89 | 90 | 91 | @pytest.mark.asyncio 92 | async def test_click(): 93 | async with AsyncClient(app=app, base_url="http://test") as ac: 94 | response = await ac.post("/click", json={"button": "left"}) 95 | assert response.status_code == 200 96 | assert 
response.json() == {"status": "success"} 97 | 98 | 99 | @pytest.mark.asyncio 100 | async def test_double_click(): 101 | async with AsyncClient(app=app, base_url="http://test") as ac: 102 | response = await ac.post("/double_click") 103 | assert response.status_code == 200 104 | assert response.json() == {"status": "success"} 105 | 106 | 107 | @pytest.mark.asyncio 108 | async def test_type_text(): 109 | async with AsyncClient(app=app, base_url="http://test") as ac: 110 | response = await ac.post( 111 | "/type_text", 112 | json={"text": "hello", "min_interval": 0.05, "max_interval": 0.25}, 113 | ) 114 | assert response.status_code == 200 115 | assert response.json() == {"status": "success"} 116 | 117 | 118 | @pytest.mark.asyncio 119 | async def test_press_key(): 120 | async with AsyncClient(app=app, base_url="http://test") as ac: 121 | response = await ac.post("/press_key", json={"key": "enter"}) 122 | assert response.status_code == 200 123 | assert response.json() == {"status": "success"} 124 | 125 | 126 | @pytest.mark.asyncio 127 | async def test_scroll(): 128 | async with AsyncClient(app=app, base_url="http://test") as ac: 129 | response = await ac.post("/scroll", json={"clicks": 3}) 130 | assert response.status_code == 200 131 | assert response.json() == {"status": "success"} 132 | 133 | 134 | @pytest.mark.asyncio 135 | async def test_drag_mouse(): 136 | with patch("agentd.server.pyautogui.dragTo") as mock_dragTo: 137 | async with AsyncClient(app=app, base_url="http://test") as ac: 138 | response = await ac.post("/drag_mouse", json={"x": 300, "y": 400}) 139 | 140 | assert response.status_code == 200 141 | assert response.json() == {"status": "success"} 142 | mock_dragTo.assert_called_once_with(300, 400) 143 | 144 | 145 | @pytest.mark.asyncio 146 | async def test_take_screenshot(): 147 | async with AsyncClient(app=app, base_url="http://test") as ac: 148 | response = await ac.post("/screenshot") 149 | assert response.status_code == 200 150 | assert "status" in response.json() 151 | assert response.json()["status"] == "success" 152 | assert "image" in response.json() 153 | assert "file_path" in response.json() 154 | 155 | 156 | @pytest.fixture 157 | def mocker(): 158 | from unittest.mock import MagicMock 159 | 160 | return MagicMock() 161 | 162 | 163 | @pytest.mark.asyncio 164 | async def test_recording_workflow(mocker): 165 | async with AsyncClient(app=app, base_url="http://test") as ac: 166 | 167 | # Test start recording 168 | description = "Test recording" 169 | response_start = await ac.post("/recordings", json={"description": description}) 170 | assert response_start.status_code == 200 171 | assert "session_id" in response_start.json() 172 | session_id = response_start.json()["session_id"] 173 | 174 | # Test list recordings 175 | mocker.patch("agentd.server.list_recordings", return_value={"recordings": []}) 176 | response_list = await ac.get("/recordings") 177 | assert response_list.status_code == 200 178 | recordings_list = response_list.json()["recordings"] 179 | assert session_id in recordings_list 180 | 181 | # Test stop recording 182 | mocker.patch( 183 | "agentd.server.sessions.get", 184 | return_value=RecordingSession(session_id, "Test"), 185 | ) 186 | mocker.patch("agentd.server.RecordingSession.stop", return_value=None) 187 | mocker.patch( 188 | "agentd.server.RecordingSession.save_to_file", return_value="path/to/file" 189 | ) 190 | response_stop = await ac.post(f"/recordings/{session_id}/stop") 191 | assert response_stop.status_code == 200 192 | 193 | # Test get recording 194 
| response_get = await ac.get(f"/recordings/{session_id}") 195 | assert response_get.status_code == 200 196 | assert "id" in response_get.json() 197 | assert "end_time" in response_get.json() 198 | 199 | # Test delete event 200 | event_id = "test_event" 201 | session = RecordingSession(session_id, "Test") 202 | mocker.patch("agentd.server.sessions.get", return_value=session) 203 | mocker.patch("agentd.server.RecordingSession.delete_event", return_value=None) 204 | response_delete_event = await ac.delete( 205 | f"/recordings/{session_id}/event/{event_id}" 206 | ) 207 | assert response_delete_event.status_code == 200 208 | assert "id" in response_delete_event.json() 209 | 210 | # Test get actions 211 | mocker.patch( 212 | "agentd.server.sessions.get", 213 | return_value=RecordingSession(session_id, "Test"), 214 | ) 215 | response_get_actions = await ac.get(f"/recordings/{session_id}/actions") 216 | assert response_get_actions.status_code == 200 217 | assert "actions" in response_get_actions.json() 218 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=$TARGETPLATFORM lscr.io/linuxserver/webtop:latest@sha256:41109089fcf80d45b25e6e3d0d8a9ae9bd13568af2d020266e55c7159fc9f2eb 2 | 3 | RUN uname -m 4 | RUN cat /etc/alpine-release 5 | 6 | # Install necessary build tools and libraries 7 | RUN echo "http://dl-cdn.alpinelinux.org/alpine/v3.20/community" >> /etc/apk/repositories && \ 8 | apk update && \ 9 | apk add --no-cache \ 10 | build-base \ 11 | libffi-dev \ 12 | openssl-dev \ 13 | zlib-dev \ 14 | bzip2-dev \ 15 | readline-dev \ 16 | sqlite-dev \ 17 | ncurses-dev \ 18 | xz-dev \ 19 | bash \ 20 | tk-dev \ 21 | gdbm-dev \ 22 | db-dev \ 23 | libpcap-dev \ 24 | linux-headers \ 25 | curl \ 26 | git \ 27 | wget \ 28 | scrot \ 29 | xrandr \ 30 | libx11 \ 31 | libxext \ 32 | libxcb \ 33 | xauth \ 34 | xwd \ 35 | imagemagick \ 36 | procps \ 37 | xdotool \ 38 | speech-dispatcher \ 39 | xclip \ 40 | gtk-murrine-engine \ 41 | sassc \ 42 | rsync \ 43 | bc \ 44 | optipng \ 45 | zip \ 46 | unzip \ 47 | xmlstarlet \ 48 | coreutils \ 49 | glib-dev \ 50 | libxml2-utils \ 51 | mesa-gl \ 52 | redis 53 | 54 | # RUN echo $USER 55 | RUN pwd 56 | RUN echo $HOME 57 | RUN echo $USER 58 | RUN echo $LOGNAME 59 | RUN echo $SHELL 60 | 61 | RUN which readlink && readlink --version 62 | 63 | RUN mkdir -p /config/.themes /config/.icons /config/.wallpapers /config/.local /config/.config/gtk-3.0 /config/.config/glib-2.0 && \ 64 | chown -R abc:abc /config/.themes /config/.icons /config/.wallpapers /config/.local /config/.config/gtk-3.0 /config/.config/glib-2.0 65 | 66 | # Set environment variables for Python installation 67 | ENV PYTHON_VERSION=3.12.1 68 | ENV PYENV_ROOT="/config/.pyenv" 69 | ENV PATH="$PYENV_ROOT/bin:$PATH" 70 | 71 | # Install pyenv as root 72 | RUN curl https://pyenv.run | bash 73 | 74 | # Change ownership of pyenv directories to user 'abc' 75 | RUN chown -R abc:abc /config/.pyenv 76 | 77 | # Create the application directory and set ownership to 'abc' 78 | RUN mkdir -p /config/app && chown -R abc:abc /config/app 79 | 80 | # Ensure the cache directory exists and is owned by 'abc' 81 | RUN mkdir -p /config/app/.cache && chown -R abc:abc /config/app/.cache 82 | 83 | # Switch to non-root user 'abc' 84 | USER abc 85 | 86 | # Create a shell script for environment setup 87 | RUN echo 'export PYENV_ROOT="/config/.pyenv"' > /config/app/pyenv_setup.sh && \ 88 | echo 'export 
PATH="$PYENV_ROOT/bin:$PYENV_ROOT/shims:$PATH"' >> /config/app/pyenv_setup.sh && \ 89 | echo 'eval "$(pyenv init --path)"' >> /config/app/pyenv_setup.sh && \ 90 | echo 'eval "$(pyenv init -)"' >> /config/app/pyenv_setup.sh && \ 91 | chmod +x /config/app/pyenv_setup.sh 92 | 93 | # Set working directory to '/config/app' 94 | WORKDIR /config/app 95 | 96 | # Copy project files (only pyproject.toml and poetry.lock to leverage caching) 97 | COPY --chown=abc:abc pyproject.toml README.md poetry.lock /config/app/ 98 | 99 | # Install Python using pyenv as 'abc' by sourcing the setup script 100 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 101 | "source /config/app/pyenv_setup.sh && pyenv install ${PYTHON_VERSION}" || \ 102 | { echo "Build failed. Showing config.log:"; cat /tmp/python-build.*/Python-*/config.log; exit 1; } 103 | 104 | # Set the global Python version 105 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 106 | "source /config/app/pyenv_setup.sh && pyenv global ${PYTHON_VERSION}" 107 | 108 | # Switch to user 'abc' 109 | USER abc 110 | RUN env 111 | 112 | # Install WhiteSur Themes and Wallpapers 113 | RUN export HOME=/config USER=abc LOGNAME=abc SHELL=/bin/bash && \ 114 | \ 115 | # Install WhiteSur GTK Theme 116 | git clone https://github.com/vinceliuice/WhiteSur-gtk-theme.git --depth=1 /config/.themes/WhiteSur-gtk-theme && \ 117 | /bin/bash -ex /config/.themes/WhiteSur-gtk-theme/install.sh -d /config/.themes && \ 118 | rm -rf /config/.themes/WhiteSur-gtk-theme && \ 119 | \ 120 | # Install WhiteSur Icon Theme 121 | git clone https://github.com/vinceliuice/WhiteSur-icon-theme.git --depth=1 /config/.icons/WhiteSur-icon-theme && \ 122 | /bin/bash -ex /config/.icons/WhiteSur-icon-theme/install.sh -d /config/.icons && \ 123 | rm -rf /config/.icons/WhiteSur-icon-theme && \ 124 | \ 125 | # Install WhiteSur Wallpapers 126 | git clone https://github.com/vinceliuice/WhiteSur-wallpapers.git --depth=1 /config/.wallpapers/WhiteSur-wallpapers && \ 127 | /bin/bash -ex /config/.wallpapers/WhiteSur-wallpapers/install-wallpapers.sh -t monterey && \ 128 | rm -rf /config/.wallpapers/WhiteSur-wallpapers 129 | 130 | RUN chown -R abc:abc /config/.themes /config/.icons /config/.local /config/.wallpapers 131 | 132 | # Copy (and overwrite) the Xfce desktop XML (wallpaper settings) 133 | COPY --chown=abc:abc ./theme/xfce4-desktop.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-desktop.xml 134 | 135 | # Copy in xsettings.xml to set GTK theme, icon theme, cursor, and fonts 136 | COPY --chown=abc:abc ./theme/xsettings.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xsettings.xml 137 | 138 | # Copy in xfwm4.xml to set the window manager theme and titlebar font 139 | COPY --chown=abc:abc ./theme/xfwm4.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml 140 | 141 | # Copy in enable-compositing.desktop to enable compositing 142 | COPY --chown=abc:abc ./theme/enable-compositing.desktop /config/.config/autostart/enable-compositing.desktop 143 | 144 | # TODO: ? 
145 | # VOLUME /config 146 | 147 | # Ensure 'abc' owns the pyenv directory after installation 148 | USER root 149 | RUN chown -R abc:abc /config/.pyenv 150 | USER abc 151 | 152 | # Create a virtual environment using the installed Python version 153 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 154 | "source /config/app/pyenv_setup.sh && python -m venv /config/app/venv" 155 | 156 | # Update PATH to include the virtual environment's bin directory 157 | ENV PATH="/config/app/venv/bin:$PATH" 158 | 159 | # Set environment variable to prevent poetry from using keyring 160 | ENV POETRY_NO_KEYRING=1 161 | 162 | # Upgrade pip to the latest version 163 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 164 | "source /config/app/pyenv_setup.sh && \ 165 | source /config/app/venv/bin/activate && \ 166 | pip install --no-cache-dir --upgrade pip" 167 | 168 | # Install project dependencies using Poetry 169 | RUN XDG_CACHE_HOME=/config/app/.cache \ 170 | POETRY_CACHE_DIR=/config/app/.cache/pypoetry \ 171 | /bin/bash -c "source /config/app/pyenv_setup.sh && \ 172 | source /config/app/venv/bin/activate && \ 173 | pip install --no-cache-dir poetry && \ 174 | poetry install --no-root" 175 | 176 | # Copy the rest of your application code 177 | COPY --chown=abc:abc . /config/app/ 178 | 179 | # Create the logs and recordings directories and set ownership to 'abc' 180 | RUN mkdir -p /config/app/logs && chown -R abc:abc /config/app/logs 181 | RUN mkdir -p /config/app/recordings && chown -R abc:abc /config/app/recordings 182 | 183 | # # Switch back to root to set up the s6-overlay v3 service 184 | USER root 185 | 186 | ENV S6_LOGGING=1 187 | ENV S6_VERBOSITY=2 188 | ENV S6_KEEP_ENV=1 189 | ENV S6_RC_VERBOSE=1 190 | 191 | # Probably don't need, for compositing 192 | # COPY xconf_run /etc/s6-overlay/s6-rc.d/xconf/up 193 | # RUN echo 'oneshot' > /etc/s6-overlay/s6-rc.d/xconf/type 194 | # RUN ln -s ../xconf /etc/s6-overlay/s6-rc.d/user/contents.d/xconf 195 | # COPY ./theme/enable-compositing.desktop /etc/xdg/autostart/enable-compositing.desktop 196 | 197 | 198 | RUN touch /config/app/audit.log && chown abc:abc /config/app/audit.log && chmod 644 /config/app/audit.log 199 | RUN touch /config/app/logs/redis_env.log && chown abc:abc /config/app/logs/redis_env.log && chmod 644 /config/app/logs/redis_env.log 200 | 201 | RUN mkdir -p /config/app/logs/uvicorn && chown -R abc:abc /config/app/logs/uvicorn 202 | 203 | RUN mkdir -p /config/app/celery && chown -R abc:abc /config/app/celery && chmod 744 /config/app/celery 204 | RUN mkdir -p /config/.agentsea && chown -R abc:abc /config/.agentsea 205 | RUN mkdir -p /config/.agentsea/data && chown -R abc:abc /config/.agentsea/data 206 | 207 | # Create the s6-overlay v3 service directory for your application 208 | RUN mkdir -p /etc/s6-overlay/s6-rc.d/uvicorn 209 | 210 | # Create Redis service directory 211 | RUN mkdir -p /etc/s6-overlay/s6-rc.d/redis 212 | 213 | # Copy the s6-overlay v3 run script into the service directory 214 | COPY uvicorn_run /etc/s6-overlay/s6-rc.d/uvicorn/run 215 | 216 | # Copy the s6-overlay v3 run script into the service directory 217 | COPY redis_run /etc/s6-overlay/s6-rc.d/redis/run 218 | 219 | # Make the run script executable 220 | RUN chmod +x /etc/s6-overlay/s6-rc.d/uvicorn/run 221 | 222 | # Make the run script executable for redis 223 | RUN chmod +x /etc/s6-overlay/s6-rc.d/redis/run 224 | 225 | # Create the 'type' file for the service 226 | RUN echo 'longrun' > /etc/s6-overlay/s6-rc.d/uvicorn/type 227 | 228 | # Create the 'type' file for Redis 
service 229 | RUN echo 'longrun' > /etc/s6-overlay/s6-rc.d/redis/type 230 | 231 | # Enable the service by creating a symlink in the 'user' bundle 232 | RUN ln -s ../uvicorn /etc/s6-overlay/s6-rc.d/user/contents.d/uvicorn 233 | 234 | # Enable Redis service by creating a symlink in the 'user' bundle 235 | RUN ln -s ../redis /etc/s6-overlay/s6-rc.d/user/contents.d/redis 236 | 237 | RUN chown -R abc:abc /config/.agentsea/data 238 | 239 | COPY conf/kasm/run /etc/s6-overlay/s6-rc.d/svc-kasmvnc/run 240 | 241 | # Create the 'data' directory for the service and set the user 242 | # RUN mkdir -p /etc/s6-overlay/s6-rc.d/uvicorn/data && \ 243 | # echo 'abc' > /etc/s6-overlay/s6-rc.d/uvicorn/data/user 244 | 245 | RUN echo 'abc' > /etc/s6-overlay/s6-rc.d/uvicorn/user 246 | 247 | # Set the user for Redis service 248 | RUN echo 'abc' > /etc/s6-overlay/s6-rc.d/redis/user 249 | 250 | ENV AGENTSEA_HOME=/config/.agentsea 251 | 252 | # Expose the port uvicorn is running on (if needed) 253 | EXPOSE 8000 254 | 255 | # Expose Redis Port, we don't need to because it should only be used internally but this is there just incase 256 | # EXPOSE 6379 -------------------------------------------------------------------------------- /Dockerfile.loaded: -------------------------------------------------------------------------------- 1 | FROM --platform=$TARGETPLATFORM lscr.io/linuxserver/webtop:latest@sha256:41109089fcf80d45b25e6e3d0d8a9ae9bd13568af2d020266e55c7159fc9f2eb 2 | 3 | RUN uname -m 4 | RUN cat /etc/alpine-release 5 | 6 | # Install necessary build tools and libraries 7 | RUN echo "http://dl-cdn.alpinelinux.org/alpine/v3.20/community" >> /etc/apk/repositories && \ 8 | apk update && \ 9 | apk add --no-cache \ 10 | build-base \ 11 | libffi-dev \ 12 | openssl-dev \ 13 | zlib-dev \ 14 | bzip2-dev \ 15 | readline-dev \ 16 | sqlite-dev \ 17 | ncurses-dev \ 18 | xz-dev \ 19 | bash \ 20 | tk-dev \ 21 | gdbm-dev \ 22 | db-dev \ 23 | libpcap-dev \ 24 | linux-headers \ 25 | curl \ 26 | git \ 27 | wget \ 28 | scrot \ 29 | xrandr \ 30 | libx11 \ 31 | libxext \ 32 | libxcb \ 33 | xauth \ 34 | xwd \ 35 | imagemagick \ 36 | procps \ 37 | xdotool \ 38 | speech-dispatcher \ 39 | xclip \ 40 | gtk-murrine-engine \ 41 | sassc \ 42 | rsync \ 43 | bc \ 44 | optipng \ 45 | zip \ 46 | unzip \ 47 | xmlstarlet \ 48 | coreutils \ 49 | glib-dev \ 50 | libxml2-utils \ 51 | mesa-gl \ 52 | redis 53 | 54 | # RUN echo $USER 55 | RUN pwd 56 | RUN echo $HOME 57 | RUN echo $USER 58 | RUN echo $LOGNAME 59 | RUN echo $SHELL 60 | 61 | RUN which readlink && readlink --version 62 | 63 | RUN mkdir -p /config/.themes /config/.icons /config/.wallpapers /config/.local /config/.config/gtk-3.0 /config/.config/glib-2.0 && \ 64 | chown -R abc:abc /config/.themes /config/.icons /config/.wallpapers /config/.local /config/.config/gtk-3.0 /config/.config/glib-2.0 65 | 66 | # Set environment variables for Python installation 67 | ENV PYTHON_VERSION=3.12.1 68 | ENV PYENV_ROOT="/config/.pyenv" 69 | ENV PATH="$PYENV_ROOT/bin:$PATH" 70 | 71 | # Install pyenv as root 72 | RUN curl https://pyenv.run | bash 73 | 74 | # Change ownership of pyenv directories to user 'abc' 75 | RUN chown -R abc:abc /config/.pyenv 76 | 77 | # Create the application directory and set ownership to 'abc' 78 | RUN mkdir -p /config/app && chown -R abc:abc /config/app 79 | 80 | # Ensure the cache directory exists and is owned by 'abc' 81 | RUN mkdir -p /config/app/.cache && chown -R abc:abc /config/app/.cache 82 | 83 | # Switch to non-root user 'abc' 84 | USER abc 85 | 86 | # Create a shell script for 
environment setup 87 | RUN echo 'export PYENV_ROOT="/config/.pyenv"' > /config/app/pyenv_setup.sh && \ 88 | echo 'export PATH="$PYENV_ROOT/bin:$PYENV_ROOT/shims:$PATH"' >> /config/app/pyenv_setup.sh && \ 89 | echo 'eval "$(pyenv init --path)"' >> /config/app/pyenv_setup.sh && \ 90 | echo 'eval "$(pyenv init -)"' >> /config/app/pyenv_setup.sh && \ 91 | chmod +x /config/app/pyenv_setup.sh 92 | 93 | # Set working directory to '/config/app' 94 | WORKDIR /config/app 95 | 96 | # Copy project files (only pyproject.toml and poetry.lock to leverage caching) 97 | COPY --chown=abc:abc pyproject.toml README.md poetry.lock /config/app/ 98 | 99 | # Install Python using pyenv as 'abc' by sourcing the setup script 100 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 101 | "source /config/app/pyenv_setup.sh && pyenv install ${PYTHON_VERSION}" || \ 102 | { echo "Build failed. Showing config.log:"; cat /tmp/python-build.*/Python-*/config.log; exit 1; } 103 | 104 | # Set the global Python version 105 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 106 | "source /config/app/pyenv_setup.sh && pyenv global ${PYTHON_VERSION}" 107 | 108 | # Switch to user 'abc' 109 | USER abc 110 | RUN env 111 | 112 | # Install WhiteSur Themes and Wallpapers 113 | RUN export HOME=/config USER=abc LOGNAME=abc SHELL=/bin/bash && \ 114 | \ 115 | # Install WhiteSur GTK Theme 116 | git clone https://github.com/vinceliuice/WhiteSur-gtk-theme.git --depth=1 /config/.themes/WhiteSur-gtk-theme && \ 117 | /bin/bash -ex /config/.themes/WhiteSur-gtk-theme/install.sh -d /config/.themes && \ 118 | rm -rf /config/.themes/WhiteSur-gtk-theme && \ 119 | \ 120 | # Install WhiteSur Icon Theme 121 | git clone https://github.com/vinceliuice/WhiteSur-icon-theme.git --depth=1 /config/.icons/WhiteSur-icon-theme && \ 122 | /bin/bash -ex /config/.icons/WhiteSur-icon-theme/install.sh -d /config/.icons && \ 123 | rm -rf /config/.icons/WhiteSur-icon-theme && \ 124 | \ 125 | # Install WhiteSur Wallpapers 126 | git clone https://github.com/vinceliuice/WhiteSur-wallpapers.git --depth=1 /config/.wallpapers/WhiteSur-wallpapers && \ 127 | /bin/bash -ex /config/.wallpapers/WhiteSur-wallpapers/install-wallpapers.sh -t monterey && \ 128 | rm -rf /config/.wallpapers/WhiteSur-wallpapers 129 | 130 | RUN chown -R abc:abc /config/.themes /config/.icons /config/.local /config/.wallpapers 131 | 132 | # Copy (and overwrite) the Xfce desktop XML (wallpaper settings) 133 | COPY --chown=abc:abc ./theme/xfce4-desktop.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-desktop.xml 134 | 135 | # Copy in xsettings.xml to set GTK theme, icon theme, cursor, and fonts 136 | COPY --chown=abc:abc ./theme/xsettings.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xsettings.xml 137 | 138 | # Copy in xfwm4.xml to set the window manager theme and titlebar font 139 | COPY --chown=abc:abc ./theme/xfwm4.xml /config/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml 140 | 141 | # Copy in enable-compositing.desktop to enable compositing 142 | COPY --chown=abc:abc ./theme/enable-compositing.desktop /config/.config/autostart/enable-compositing.desktop 143 | 144 | # TODO: ? 
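# Note: pyproject.toml and poetry.lock are copied above, before the rest of the
# source tree, so the `poetry install --no-root` layer further down is cached
# until the lock file changes; --no-root installs only the dependencies, while
# the project itself is copied in afterwards.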
145 | # VOLUME /config 146 | 147 | # Ensure 'abc' owns the pyenv directory after installation 148 | USER root 149 | RUN chown -R abc:abc /config/.pyenv 150 | USER abc 151 | 152 | # Create a virtual environment using the installed Python version 153 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 154 | "source /config/app/pyenv_setup.sh && python -m venv /config/app/venv" 155 | 156 | # Update PATH to include the virtual environment's bin directory 157 | ENV PATH="/config/app/venv/bin:$PATH" 158 | 159 | # Set environment variable to prevent poetry from using keyring 160 | ENV POETRY_NO_KEYRING=1 161 | 162 | # Upgrade pip to the latest version 163 | RUN XDG_CACHE_HOME=/config/app/.cache /bin/bash -c \ 164 | "source /config/app/pyenv_setup.sh && \ 165 | source /config/app/venv/bin/activate && \ 166 | pip install --no-cache-dir --upgrade pip" 167 | 168 | # Install project dependencies using Poetry 169 | RUN XDG_CACHE_HOME=/config/app/.cache \ 170 | POETRY_CACHE_DIR=/config/app/.cache/pypoetry \ 171 | /bin/bash -c "source /config/app/pyenv_setup.sh && \ 172 | source /config/app/venv/bin/activate && \ 173 | pip install --no-cache-dir poetry && \ 174 | poetry install --no-root" 175 | 176 | # Copy the rest of your application code 177 | COPY --chown=abc:abc . /config/app/ 178 | 179 | # Create the logs and recordings directories and set ownership to 'abc' 180 | RUN mkdir -p /config/app/logs && chown -R abc:abc /config/app/logs 181 | RUN mkdir -p /config/app/recordings && chown -R abc:abc /config/app/recordings 182 | 183 | # # Switch back to root to set up the s6-overlay v3 service 184 | USER root 185 | 186 | ENV S6_LOGGING=1 187 | ENV S6_VERBOSITY=2 188 | ENV S6_KEEP_ENV=1 189 | ENV S6_RC_VERBOSE=1 190 | 191 | # Probably don't need, for compositing 192 | # COPY xconf_run /etc/s6-overlay/s6-rc.d/xconf/up 193 | # RUN echo 'oneshot' > /etc/s6-overlay/s6-rc.d/xconf/type 194 | # RUN ln -s ../xconf /etc/s6-overlay/s6-rc.d/user/contents.d/xconf 195 | # COPY ./theme/enable-compositing.desktop /etc/xdg/autostart/enable-compositing.desktop 196 | 197 | 198 | RUN touch /config/app/audit.log && chown abc:abc /config/app/audit.log && chmod 644 /config/app/audit.log 199 | RUN touch /config/app/logs/redis_env.log && chown abc:abc /config/app/logs/redis_env.log && chmod 644 /config/app/logs/redis_env.log 200 | 201 | RUN mkdir -p /config/app/logs/uvicorn && chown -R abc:abc /config/app/logs/uvicorn 202 | 203 | RUN mkdir -p /config/app/celery && chown -R abc:abc /config/app/celery && chmod 744 /config/app/celery 204 | RUN mkdir -p /config/.agentsea && chown -R abc:abc /config/.agentsea 205 | RUN mkdir -p /config/.agentsea/data && chown -R abc:abc /config/.agentsea/data 206 | 207 | # Create the s6-overlay v3 service directory for your application 208 | RUN mkdir -p /etc/s6-overlay/s6-rc.d/uvicorn 209 | 210 | # Create Redis service directory 211 | RUN mkdir -p /etc/s6-overlay/s6-rc.d/redis 212 | 213 | # Copy the s6-overlay v3 run script into the service directory 214 | COPY uvicorn_run /etc/s6-overlay/s6-rc.d/uvicorn/run 215 | 216 | # Copy the s6-overlay v3 run script into the service directory 217 | COPY redis_run /etc/s6-overlay/s6-rc.d/redis/run 218 | 219 | # Make the run script executable 220 | RUN chmod +x /etc/s6-overlay/s6-rc.d/uvicorn/run 221 | 222 | # Make the run script executable for redis 223 | RUN chmod +x /etc/s6-overlay/s6-rc.d/redis/run 224 | 225 | # Create the 'type' file for the service 226 | RUN echo 'longrun' > /etc/s6-overlay/s6-rc.d/uvicorn/type 227 | 228 | # Create the 'type' file for Redis 
service 229 | RUN echo 'longrun' > /etc/s6-overlay/s6-rc.d/redis/type 230 | 231 | # Enable the service by creating a symlink in the 'user' bundle 232 | RUN ln -s ../uvicorn /etc/s6-overlay/s6-rc.d/user/contents.d/uvicorn 233 | 234 | # Enable Redis service by creating a symlink in the 'user' bundle 235 | RUN ln -s ../redis /etc/s6-overlay/s6-rc.d/user/contents.d/redis 236 | 237 | RUN chown -R abc:abc /config/.agentsea/data 238 | 239 | COPY conf/kasm/run /etc/s6-overlay/s6-rc.d/svc-kasmvnc/run 240 | 241 | # Create the 'data' directory for the service and set the user 242 | # RUN mkdir -p /etc/s6-overlay/s6-rc.d/uvicorn/data && \ 243 | # echo 'abc' > /etc/s6-overlay/s6-rc.d/uvicorn/data/user 244 | 245 | RUN echo 'abc' > /etc/s6-overlay/s6-rc.d/uvicorn/user 246 | 247 | # Set the user for Redis service 248 | RUN echo 'abc' > /etc/s6-overlay/s6-rc.d/redis/user 249 | 250 | ENV AGENTSEA_HOME=/config/.agentsea 251 | 252 | # Install extras 253 | RUN apk add --no-cache \ 254 | libreoffice \ 255 | gimp \ 256 | inkscape \ 257 | vlc \ 258 | thunderbird \ 259 | audacity \ 260 | filezilla \ 261 | evolution \ 262 | kodi \ 263 | handbrake \ 264 | openmpi-dev 265 | 266 | RUN set -e; \ 267 | mkdir -p /config/Desktop && \ 268 | \ 269 | ##### 1) Copy the selected launchers if they exist ##### 270 | for file in /usr/share/applications/libreoffice-*.desktop \ 271 | /usr/share/applications/gimp*.desktop \ 272 | /usr/share/applications/inkscape*.desktop \ 273 | /usr/share/applications/audacity.desktop \ 274 | /usr/share/applications/kodi.desktop \ 275 | /usr/share/applications/firefox.desktop \ 276 | /usr/share/applications/mousepad.desktop; do \ 277 | [ -e "$file" ] || continue; \ 278 | name=$(basename "$file"); \ 279 | target="/config/Desktop/$name"; \ 280 | [ -f "$target" ] || cp "$file" "$target"; \ 281 | done && \ 282 | \ 283 | ##### 2) Final permissions ##### 284 | chmod +x /config/Desktop/*.desktop && \ 285 | chown -R abc:abc /config/Desktop 286 | 287 | # Expose the port uvicorn is running on (if needed) 288 | EXPOSE 8000 289 | 290 | # Expose Redis Port, we don't need to because it should only be used internally but this is there just incase 291 | # EXPOSE 6379 -------------------------------------------------------------------------------- /agentd/server.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import getpass 3 | import logging 4 | import os 5 | import platform 6 | import random 7 | import subprocess 8 | import sys 9 | import threading 10 | import time 11 | import uuid 12 | from datetime import datetime 13 | from typing import Optional 14 | import pyperclip 15 | import requests 16 | 17 | import psutil 18 | import pyautogui 19 | from fastapi import Body, FastAPI, HTTPException, Request 20 | from fastapi.middleware.cors import CORSMiddleware 21 | from fastapi.responses import FileResponse 22 | from pydantic import BaseModel 23 | from taskara.task import Task 24 | 25 | from agentd.util import log_subprocess_output 26 | 27 | from .firefox import ( 28 | gracefully_terminate_firefox, 29 | is_firefox_running, 30 | is_firefox_window_open, 31 | maximize_firefox_window, 32 | ) 33 | from .models import ( 34 | ClickModel, 35 | CoordinatesModel, 36 | DragMouseModel, 37 | MoveMouseModel, 38 | OpenURLModel, 39 | PressKeyModel, 40 | PressKeysModel, 41 | RecordRequest, 42 | RecordResponse, 43 | ScreenshotResponseModel, 44 | ScreenSizeModel, 45 | ScrollModel, 46 | SystemInfoModel, 47 | SystemUsageModel, 48 | TypeTextModel, 49 | StopRequest, 50 | 
useSecretRequest, 51 | getSecretRequest 52 | ) 53 | from .recording import RecordingSession, lock 54 | 55 | import logging 56 | import logging.config 57 | from .logging_config import LOGGING_CONFIG # or wherever you store the config 58 | 59 | logging.config.dictConfig(LOGGING_CONFIG) 60 | 61 | # Create logger instances 62 | api_logger = logging.getLogger("api") 63 | 64 | current_user: str = getpass.getuser() 65 | api_logger.info(f"current user: {current_user}") 66 | 67 | active_session: Optional[RecordingSession] = None 68 | 69 | app = FastAPI() 70 | 71 | app.add_middleware( 72 | CORSMiddleware, 73 | allow_origins=["*"], 74 | allow_credentials=True, 75 | allow_methods=["*"], 76 | allow_headers=["*"], 77 | ) 78 | 79 | 80 | @app.middleware("http") 81 | async def log_requests(request: Request, call_next): 82 | # Log the request details 83 | api_logger.info(f"Method: {request.method} Path: {request.url.path}") 84 | response = await call_next(request) 85 | return response 86 | 87 | 88 | @app.get("/") 89 | async def root(): 90 | return {"message": "Agent in the shell"} 91 | 92 | 93 | @app.get("/health") 94 | async def health(): 95 | return {"status": "ok"} 96 | 97 | 98 | @app.get("/v1/info", response_model=SystemInfoModel) 99 | async def get_info(): 100 | # Screen size 101 | width, height = pyautogui.size() 102 | screen_size = ScreenSizeModel(x=width, y=height) 103 | 104 | # OS Info 105 | os_info = f"{platform.system()} {platform.release()}" 106 | 107 | # Code Version (Git) 108 | try: 109 | code_version = ( 110 | subprocess.check_output(["git", "rev-parse", "HEAD"]) 111 | .decode("utf-8") 112 | .strip() 113 | ) 114 | except Exception: 115 | code_version = None 116 | 117 | # Last Activity from log 118 | try: 119 | with open("audit.log", "r") as f: 120 | lines = f.readlines() 121 | last_activity_unix = None 122 | if lines: 123 | last_line = lines[-1] 124 | last_activity_str = last_line.split(" - ")[0] 125 | last_activity_datetime = datetime.strptime( 126 | last_activity_str, "%Y-%m-%d %H:%M:%S" 127 | ) 128 | last_activity_unix = int( 129 | time.mktime(last_activity_datetime.timetuple()) 130 | ) 131 | except Exception: 132 | last_activity_unix = None 133 | 134 | return SystemInfoModel( 135 | last_activity_ts=last_activity_unix, 136 | screen_size=screen_size, 137 | os_info=os_info, 138 | code_version=code_version, 139 | ) 140 | 141 | 142 | @app.get("/v1/screen_size") 143 | def get_screen_size() -> ScreenSizeModel: 144 | width, height = pyautogui.size() 145 | return ScreenSizeModel(x=width, y=height) 146 | 147 | 148 | @app.get("/v1/mouse_coordinates") 149 | async def mouse_coordinates() -> CoordinatesModel: 150 | x, y = pyautogui.position() 151 | return CoordinatesModel(x=x, y=y) # type: ignore 152 | 153 | 154 | @app.post("/v1/open_url") 155 | async def open_url(request: OpenURLModel): 156 | try: 157 | firefox_pids = is_firefox_running() 158 | if firefox_pids: 159 | api_logger.info("Firefox is running. 
Restarting it...") 160 | gracefully_terminate_firefox(firefox_pids) 161 | time.sleep(5) 162 | 163 | api_logger.info("Starting Firefox...") 164 | subprocess.Popen( 165 | [ 166 | "firefox", 167 | request.url, 168 | ], 169 | stdout=sys.stdout, 170 | stderr=sys.stderr, 171 | ) 172 | 173 | while not is_firefox_window_open(): 174 | time.sleep(1) 175 | api_logger.info("Waiting for the Firefox window to open...") 176 | 177 | maximize_firefox_window() 178 | 179 | return {"status": "success"} 180 | 181 | except Exception as e: 182 | return {"status": "error", "message": str(e)} 183 | 184 | 185 | @app.post("/v1/move_mouse") 186 | async def move_mouse_to(request: MoveMouseModel): 187 | try: 188 | tween_func = getattr(pyautogui, request.tween, pyautogui.linear) 189 | pyautogui.moveTo( 190 | request.x, request.y, duration=request.duration, tween=tween_func 191 | ) 192 | return {"status": "success"} 193 | except Exception as e: 194 | return {"status": "error", "message": str(e)} 195 | 196 | 197 | @app.post("/v1/click") 198 | async def click(request: ClickModel): 199 | if request.location: 200 | tween_func = getattr(pyautogui, request.location.tween, pyautogui.linear) 201 | pyautogui.moveTo( 202 | request.location.x, 203 | request.location.y, 204 | duration=request.location.duration, 205 | tween=tween_func, 206 | ) 207 | try: 208 | pyautogui.click(button=request.button) 209 | return {"status": "success"} 210 | except Exception as e: 211 | raise HTTPException(status_code=500, detail=str(e)) 212 | 213 | 214 | @app.post("/v1/double_click") 215 | async def double_click(request: ClickModel): 216 | if request.location: 217 | tween_func = getattr(pyautogui, request.location.tween, pyautogui.linear) 218 | pyautogui.moveTo( 219 | request.location.x, 220 | request.location.y, 221 | duration=request.location.duration, 222 | tween=tween_func, 223 | ) 224 | try: 225 | pyautogui.doubleClick(button=request.button) 226 | return {"status": "success"} 227 | except Exception as e: 228 | raise HTTPException(status_code=500, detail=str(e)) 229 | 230 | 231 | @app.post("/v1/type_text") 232 | async def type_text(request: TypeTextModel): 233 | try: 234 | for char in request.text: 235 | pyautogui.write( 236 | char, 237 | interval=random.uniform(request.min_interval, request.max_interval), 238 | ) 239 | time.sleep(random.uniform(request.min_interval, request.max_interval)) 240 | return {"status": "success"} 241 | except Exception as e: 242 | raise HTTPException(status_code=500, detail=str(e)) 243 | 244 | 245 | @app.post("/v1/press_key") 246 | async def press_key(request: PressKeyModel): 247 | try: 248 | pyautogui.press(request.key) 249 | return {"status": "success"} 250 | except Exception as e: 251 | raise HTTPException(status_code=500, detail=str(e)) 252 | 253 | 254 | @app.post("/v1/hot_key") 255 | async def hot_key(request: PressKeysModel): 256 | try: 257 | pyautogui.hotkey(*request.keys) 258 | return {"status": "success"} 259 | except Exception as e: 260 | raise HTTPException(status_code=500, detail=str(e)) 261 | 262 | 263 | @app.post("/v1/scroll") 264 | async def scroll(request: ScrollModel): 265 | try: 266 | # clicks > 0: scrolls UP 267 | # clicks < 0: scrolls DOWN 268 | pyautogui.scroll(request.clicks) 269 | return {"status": "success"} 270 | except Exception as e: 271 | raise HTTPException(status_code=500, detail=str(e)) 272 | 273 | 274 | @app.post("/v1/drag_mouse") 275 | async def drag_mouse(request: DragMouseModel): 276 | try: 277 | pyautogui.dragTo(request.x, request.y) 278 | return {"status": "success"} 279 | except 
Exception as e: 280 | raise HTTPException(status_code=500, detail=str(e)) 281 | 282 | 283 | @app.post("/v1/screenshot", response_model=ScreenshotResponseModel) 284 | async def take_screenshot( 285 | count: int = 1, delay: float = 0.0 286 | ) -> ScreenshotResponseModel: 287 | try: 288 | os.environ["DISPLAY"] = ":1.0" 289 | 290 | # Create a directory for screenshots if it doesn't exist 291 | screenshots_dir = "screenshots" 292 | os.makedirs(screenshots_dir, exist_ok=True) 293 | 294 | file_paths = [] 295 | 296 | # Loop for the number of screenshots specified by 'count' 297 | for i in range(count): 298 | # Generate a unique file name based on the current timestamp and index 299 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 300 | file_path = os.path.join( 301 | screenshots_dir, f"screenshot_{timestamp}_{i + 1}.png" 302 | ) 303 | 304 | # Use scrot to take a screenshot with the cursor (-p flag) 305 | subprocess.run(["scrot", "-z", "-p", file_path], check=True) 306 | 307 | file_paths.append(file_path) 308 | 309 | # Delay between screenshots if specified 310 | if i < count - 1: 311 | time.sleep(delay) 312 | 313 | # Now that all screenshots are taken, read, encode, and delete them 314 | encoded_images = [] 315 | 316 | for file_path in file_paths: 317 | # Read and encode the image 318 | with open(file_path, "rb") as image_file: 319 | encoded_image = base64.b64encode(image_file.read()).decode("utf-8") 320 | encoded_images.append(encoded_image) 321 | 322 | # Delete the file after encoding 323 | os.remove(file_path) 324 | 325 | # Return the list of encoded images 326 | response = ScreenshotResponseModel( 327 | status="success", 328 | images=encoded_images, # List of all encoded images 329 | ) 330 | 331 | return response 332 | 333 | except Exception as e: 334 | raise HTTPException(status_code=500, detail=str(e)) 335 | 336 | 337 | @app.post("/v1/exec") 338 | async def exec_command(command: str = Body(..., embed=True)): 339 | try: 340 | # Execute the provided command 341 | result = subprocess.run( 342 | command, 343 | shell=True, 344 | stdout=subprocess.PIPE, 345 | stderr=subprocess.PIPE, 346 | text=True, 347 | ) 348 | 349 | # Check if the command was successful 350 | if result.returncode == 0: 351 | return {"status": "success", "output": result.stdout.strip()} 352 | else: 353 | return { 354 | "status": "error", 355 | "output": result.stderr.strip(), 356 | "return_code": result.returncode, 357 | } 358 | 359 | except Exception as e: 360 | raise HTTPException(status_code=500, detail=str(e)) 361 | 362 | @app.post("/v1/use_secret") 363 | async def use_secret(request: useSecretRequest): 364 | global active_session 365 | api_logger.info(f"using secret {request.name} and applying {request.field}") 366 | try: 367 | # Get the secret 368 | url = f"{request.server_address}/v1/secrets/search" 369 | json_data={"name": request.name} 370 | headers= {"Authorization": f"bearer {request.token}"} 371 | response = requests.post(url, json=json_data, headers=headers) 372 | # Check the response status 373 | if response.status_code != 200: 374 | api_logger.info(f"secret fetch failed, name: {request.name}, status_code: {response.status_code} detail: {response.text}") 375 | raise HTTPException( 376 | status_code=response.status_code, 377 | detail=f"Failed to fetch secret: {response.text}", 378 | ) 379 | secrets = response.json() 380 | secret = secrets["secrets"][0] 381 | api_logger.info(f"secret fetched: {secret['name']}") 382 | event_time = time.time() 383 | try: 384 | #TODO will encrypt secret values in transit. 
Will want to use a private key in the system env to decrypt.
385 |             # We can rotate the private key every so often. We are already using https, but it would be good to have another layer.
386 |             # An example of where this is useful: you won't see real secret values in the network tab of the browser.
387 |             try:
388 |                 password = secret["value"][request.field]
389 |             except KeyError:
390 |                 raise HTTPException(
391 |                     status_code=400,
392 |                     detail=f"Field '{request.field}' not found in the secret."
393 |                 )
394 |             if active_session:
395 |                 active_session.pause_listeners()
396 |             else:
397 |                 api_logger.error("secret used but without active session")
398 | 
399 |             for char in password:
400 |                 pyautogui.write(
401 |                     char,
402 |                     # interval=random.uniform(request.min_interval, request.max_interval),
403 |                 )
404 |                 # time.sleep(random.uniform(request.min_interval, request.max_interval))
405 |             pyperclip.copy(password)  # TODO consider copy-paste instead of writing
406 |             api_logger.info("secret text copied to clipboard.")
407 | 
408 |             if active_session:
409 |                 active_session.resume_listeners()
410 |                 active_session.record_useSecret_action(secret_name=secret['name'], field=request.field, event_time=event_time)
411 |             else:
412 |                 api_logger.error("secret used but without active session")
413 | 
414 |             return {"status": "success"}
415 |         except HTTPException:
416 |             raise  # don't rewrap deliberate HTTP errors (like the 400 above) as 500s
417 |         except Exception as e:
418 |             if active_session:
419 |                 active_session.resume_listeners()
420 |             raise HTTPException(status_code=500, detail=str(e))
421 | 
422 |     except HTTPException:
423 |         raise
424 |     except Exception as e:
425 |         raise HTTPException(status_code=500, detail=str(e))
426 | 
427 | @app.post("/v1/get_secrets")
428 | async def get_secret(request: getSecretRequest):
429 |     api_logger.info(f"getting secrets: {request.model_dump_json()}")
430 |     try:
431 |         # Get the secrets
432 |         url = f"{request.server_address}/v1/secrets"
433 |         headers = {"Authorization": f"bearer {request.token}"}
434 |         response = requests.get(url, headers=headers)
435 |         # Check the response status
436 |         try:
437 |             response.raise_for_status()
438 |         except requests.exceptions.HTTPError as e:
439 |             # Extract the status code and content; a Response is falsy on 4xx/5xx, so compare to None
440 |             if response is not None:
441 |                 status_code = response.status_code
442 |                 error_message = response.text
443 |             else:
444 |                 status_code = 500
445 |                 error_message = f"An unknown error occurred: {str(e)}"
446 |             raise HTTPException(
447 |                 status_code=status_code,
448 |                 detail=f"Error: {error_message}"
449 |             )
450 |         secrets = response.json()
451 |         api_logger.info(f"in get_secret, response is: {secrets}")
452 |         result = [{"name": secret["name"], "fields": list(secret["value"].keys())} for secret in secrets["secrets"]]
453 |         return result
454 | 
455 |     except requests.RequestException as e:
456 |         # Handle general request exceptions
457 |         raise HTTPException(
458 |             status_code=500,
459 |             detail=f"An unknown error occurred: {str(e)}"
460 |         )
461 | 
462 | 
463 | @app.get("/v1/system_usage", response_model=SystemUsageModel)
464 | async def system_usage():
465 |     cpu_percent = psutil.cpu_percent()
466 |     memory = psutil.virtual_memory()
467 |     disk = psutil.disk_usage("/")
468 | 
469 |     return SystemUsageModel(
470 |         cpu_percent=cpu_percent,  # type: ignore
471 |         memory_percent=memory.percent,
472 |         disk_percent=disk.percent,
473 |     )
474 | 
475 | 
476 | ##
477 | ### Demonstrate
478 | ##
479 | 
480 | 
481 | @app.post("/v1/start_recording", response_model=RecordResponse)
482 | async def start_recording(request: RecordRequest):
483 |     global active_session
484 |     session_id = str(uuid.uuid4())
485 | 
486 |     if not request.description and not request.task_id:
487 |         raise HTTPException(
488 |             status_code=400,
489 |             detail="Either description or task_id must be provided",
490 |         )
491 | 
492 |     if request.description:
493 |         task = Task(
494 |             description=request.description,
495 |             remote=request.server_address,
496 |             auth_token=request.token,
497 |             owner_id=request.owner_id,
498 |             skill=request.skill_id,
499 |         )
500 |     else:
501 |         tasks = Task.find(
502 |             remote=request.server_address,
503 |             id=request.task_id,
504 |             auth_token=request.token,
505 |             owner_id=request.owner_id,
506 |         )
507 |         if not tasks:
508 |             raise HTTPException(status_code=404, detail="Task not found")
509 |         task = tasks[0]
510 |     # launching celery worker
511 |     command = ["celery", "-A", "agentd.celery_worker", "worker", "--loglevel=debug"]
512 |     subProc = subprocess.Popen(
513 |         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
514 |     )
515 |     # starting a new thread below to capture worker logs in our stdout for uvicorn
516 |     threading.Thread(
517 |         target=log_subprocess_output,
518 |         args=(subProc.stdout, "celery_worker"),
519 |         daemon=True,
520 |     ).start()
521 | 
522 |     with lock:
523 |         if active_session:
524 |             raise HTTPException(
525 |                 status_code=400,
526 |                 detail="A recording session is already active. Stop it first.",
527 |             )
528 |         session = RecordingSession(id=session_id, task=task)
529 |         session.start()
530 |         active_session = session
531 |     return RecordResponse(task_id=task.id)
532 | 
533 | 
534 | @app.post("/v1/stop_recording")
535 | async def stop_recording(request: StopRequest):
536 |     global active_session
537 |     with lock:
538 |         if not active_session:
539 |             raise HTTPException(status_code=404, detail="Session not found")
540 |         active_session.stop(result=request.result, comment=request.comment)
541 |         api_logger.info("Stopped recording session")
542 | 
543 |         active_session = None
544 |     return
545 | 
546 | 
547 | ##
548 | ### Video Recording
549 | ##
550 | 
551 | video_recording_process = None
552 | video_recording_file_path = None
553 | video_recording_lock = threading.Lock()
554 | video_recordings_dir = "video_recordings"
555 | os.makedirs(video_recordings_dir, exist_ok=True)
556 | 
557 | 
558 | class VideoRecordRequest(BaseModel):
559 |     framerate: int
560 | 
561 | 
562 | class VideoRecordResponse(BaseModel):
563 |     session_id: str
564 | 
565 | 
566 | class VideoRecordings(BaseModel):
567 |     recordings: list[str]
568 | 
569 | 
570 | class VideoRecordModel(BaseModel):
571 |     status: str
572 |     file_path: str
573 | 
574 | 
575 | @app.post("/v1/start_video_recording", response_model=VideoRecordResponse)
576 | async def start_video_recording(request: VideoRecordRequest):
577 |     global video_recording_process, video_recording_file_path
578 |     with video_recording_lock:
579 |         if video_recording_process is not None:
580 |             raise HTTPException(
581 |                 status_code=400, detail="Video recording is already in progress."
582 |             )
583 | 
584 |         session_id = str(uuid.uuid4())
585 |         file_path = os.path.join(video_recordings_dir, f"{session_id}.mp4")
586 |         # Remember the output path so stop_video_recording can report it.
587 |         video_recording_file_path = file_path
588 | 
589 |         video_recording_process = subprocess.Popen(
590 |             [
591 |                 "ffmpeg",
592 |                 "-video_size",
593 |                 "1280x800",  # TODO we need to make this configurable like framerate
594 |                 "-framerate",
595 |                 f"{request.framerate}",
596 |                 "-f",
597 |                 "x11grab",
598 |                 "-i",
599 |                 ":1",
600 |                 file_path,
601 |             ]
602 |         )
603 | 
604 |         return VideoRecordResponse(session_id=session_id)
605 | 
606 | 
607 | @app.post("/v1/stop_video_recording", response_model=VideoRecordModel)
608 | async def stop_video_recording():
609 |     global video_recording_process, video_recording_file_path
610 |     with video_recording_lock:
611 |         if video_recording_process is None:
612 |             raise HTTPException(
613 |                 status_code=400, detail="No video recording in progress."
614 |             )
615 | 
616 |         # Let ffmpeg exit and finalize the file before reporting it.
617 |         video_recording_process.terminate()
618 |         video_recording_process.wait()
619 |         video_recording_process = None
620 | 
621 |         # Return the file created by start_video_recording; minting a fresh
622 |         # session_id here would point at a file that was never written.
623 |         file_path = video_recording_file_path
624 |         video_recording_file_path = None
625 | 
626 |         return VideoRecordModel(status="success", file_path=file_path)
627 | 
628 | 
629 | @app.get("/v1/video_recordings", response_model=VideoRecordings)
630 | async def list_video_recordings():
631 |     recordings = os.listdir(video_recordings_dir)
632 |     return VideoRecordings(recordings=recordings)
633 | 
634 | 
635 | @app.get("/v1/video_recordings/{session_id}", response_class=FileResponse)
636 | async def get_video_recording(session_id: str):
637 |     file_path = os.path.join(video_recordings_dir, f"{session_id}.mp4")
638 |     if not os.path.exists(file_path):
639 |         raise HTTPException(status_code=404, detail="Recording not found.")
640 | 
641 |     return FileResponse(file_path, media_type="video/mp4", filename=f"{session_id}.mp4")
642 | 
643 | 
644 | @app.delete("/v1/video_recordings/{session_id}", response_model=VideoRecordModel)
645 | async def delete_video_recording(session_id: str):
646 |     file_path = os.path.join(video_recordings_dir, f"{session_id}.mp4")
647 |     if not os.path.exists(file_path):
648 |         raise HTTPException(status_code=404, detail="Recording not found.")
649 | 
650 |     os.remove(file_path)
651 |     return VideoRecordModel(status="success", file_path=file_path)
652 | 
--------------------------------------------------------------------------------
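A minimal client sketch (not part of the repository) showing how the agentd HTTP API in agentd/server.py above can be driven. It assumes the server is reachable on localhost:8000 (the port the Dockerfile exposes) and uses only request fields visible in the handlers and tests; optional fields such as ClickModel's location are left to their defaults.

import base64

import requests

BASE = "http://localhost:8000"

# Move the mouse with an easing tween, then left-click (/v1/move_mouse, /v1/click).
requests.post(
    f"{BASE}/v1/move_mouse",
    json={"x": 100, "y": 200, "duration": 1.0, "tween": "easeInOutQuad"},
)
requests.post(f"{BASE}/v1/click", json={"button": "left"})

# Type text with randomized per-keystroke intervals (/v1/type_text).
requests.post(
    f"{BASE}/v1/type_text",
    json={"text": "hello", "min_interval": 0.05, "max_interval": 0.25},
)

# Take a screenshot; the handler returns base64-encoded PNGs under "images".
shot = requests.post(f"{BASE}/v1/screenshot").json()
with open("screen.png", "wb") as f:
    f.write(base64.b64decode(shot["images"][0]))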