├── __init__.py
├── tests
│   ├── __init__.py
│   ├── run.sh
│   ├── util.py
│   ├── test_schema.py
│   ├── test_grammar.py
│   ├── test_gbdhash.py
│   ├── test_initializer.py
│   ├── test_db_nonunique_features.py
│   ├── test_querybuilder.py
│   ├── test_db_unique_features.py
│   └── test_api.py
├── gbd_core
│   ├── __init__.py
│   ├── config.py
│   ├── contexts.py
│   ├── util_argparse.py
│   ├── util.py
│   ├── query.py
│   ├── grammar.py
│   ├── api.py
│   ├── database.py
│   └── schema.py
├── gbd_init
│   ├── __init__.py
│   ├── gbdhash.py
│   ├── initializer.py
│   ├── instance_transformers.py
│   └── feature_extractors.py
├── gbd_server
│   ├── __init__.py
│   ├── static
│   │   ├── img
│   │   │   ├── gbd_logo.jpg
│   │   │   ├── gbd_logo.png
│   │   │   └── gbd_logo_small.png
│   │   ├── main.css
│   │   └── w3.js
│   ├── templates
│   │   └── index.html
│   └── server.py
├── update_tool.sh
├── MANIFEST.in
├── docker
│   ├── build.sh
│   ├── Dockerfile.gbd
│   ├── Dockerfile.nginx
│   ├── entrypoint.nginx.sh
│   ├── docker-compose.yml
│   └── configs
│       ├── nginx.conf
│       └── nginx.https.conf
├── CITATION.cff
├── .gitignore
├── default_config.toml
├── .gitattributes
├── pyproject.toml
├── setup.py.backup
├── LICENSE
├── .gitlab-ci.yml
├── .github
│   └── workflows
│       ├── gh-pages-apidoc.yml
│       └── docker-image.yml
├── README.md
└── gbd.py
/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gbd_core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gbd_init/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gbd_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/run.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH=..
python3 -m unittest *.py 2 | -------------------------------------------------------------------------------- /gbd_server/static/img/gbd_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo.jpg -------------------------------------------------------------------------------- /gbd_server/static/img/gbd_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo.png -------------------------------------------------------------------------------- /gbd_server/static/img/gbd_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo_small.png -------------------------------------------------------------------------------- /update_tool.sh: -------------------------------------------------------------------------------- 1 | sudo rm -rf dist/ 2 | # sudo python3 setup.py develop sdist bdist_wheel 3 | # twine upload dist/* 4 | sudo python3 -m pip install --upgrade build twine 5 | sudo python3 -m build 6 | twine upload dist/* 7 | sudo rm -Rf gbd_tools.egg-info dist build 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include gbd_server/templates/*.html 4 | include gbd_server/static/img/*.png 5 | include gbd_server/static/img/*.jpg 6 | include gbd_server/static/*.css 7 | include gbd_server/static/*.js 8 | include default_config.toml 9 | 10 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ]; then 4 | echo "Usage: $0 [ nginx | gbd | ... ]" 5 | exit 0 6 | fi 7 | 8 | if [ ! -e Dockerfile.$1 ]; then 9 | echo "Dockerfile.$1 not found" 10 | exit 1 11 | fi 12 | 13 | docker build --no-cache -t my$1 -f Dockerfile.$1 . 14 | -------------------------------------------------------------------------------- /docker/Dockerfile.gbd: -------------------------------------------------------------------------------- 1 | FROM python:slim 2 | 3 | #ENV GBD_DB=/gbd/meta.db:/gbd/base.db:/gbd/gate.db 4 | ENV GBD_LOGS=/raid/gbd/logs 5 | ENV GBD_PORT=44071 6 | 7 | RUN apt-get update -y \ 8 | && apt-get install -y wget \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 11 | 12 | RUN pip install --no-cache-dir gbd-tools 13 | 14 | WORKDIR /gbd 15 | 16 | EXPOSE 44071 17 | 18 | ENTRYPOINT [ "gbd", "serve" ] 19 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Iser" 5 | given-names: "Ashlin" 6 | orcid: "https://orcid.org/0000-0003-2904-232X" 7 | - family-names: "Jabs" 8 | given-names: "Christoph" 9 | orcid: "https://orcid.org/0000-0003-3532-696X" 10 | title: "GBD Tools" 11 | version: 4.7.0 12 | doi: 10.5281/zenodo.10213944 13 | date-released: 2023-11-28 14 | url: "https://github.com/Udopia/gbd" 15 | -------------------------------------------------------------------------------- /docker/Dockerfile.nginx: -------------------------------------------------------------------------------- 1 | FROM nginx:alpine 2 | 3 | ENV VIRTUAL_HOST=localhost 4 | ENV AWSTATS_USER=statsuser 5 | ENV AWSTATS_PASS=stats1234 6 | 7 | RUN apk add --no-cache bash awstats apache2-utils 8 | 9 | WORKDIR /awstats 10 | RUN mkdir -p /awstats/www 11 | 12 | COPY configs/nginx.https.conf /etc/nginx/nginx.conf 13 | COPY entrypoint.nginx.sh /entrypoint.nginx.sh 14 | COPY configs/awstats.conf /etc/awstats/awstats.conf 15 | 16 | EXPOSE 80 17 | 18 | CMD [ "/entrypoint.nginx.sh" ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | *.xls 3 | *.pyc 4 | 5 | # Compiled class file 6 | *.class 7 | 8 | # Log file 9 | *.log 10 | 11 | # BlueJ files 12 | *.ctxt 13 | 14 | # Mobile Tools for Java (J2ME) 15 | .mtj.tmp/ 16 | 17 | # Package Files # 18 | *.jar 19 | *.war 20 | *.nar 21 | *.ear 22 | *.zip 23 | *.tar.gz 24 | *.rar 25 | 26 | # individual files and folders 27 | gbd_server/gbd-server-log* 28 | cli_config/* 29 | server/server_config/* 30 | */default_config/* 31 | server/cache/* 32 | .vscode/ 33 | gbd-server-logs/* 34 | .eggs/* 35 | build/ 36 | dist/ 37 | *.egg-info/ 38 | -------------------------------------------------------------------------------- /docker/entrypoint.nginx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Configures nginx and starts it 4 | sed -i "s/__VIRTUAL_HOST__/$VIRTUAL_HOST/g" /etc/nginx/nginx.conf 5 | 6 | nginx 7 | 8 | 9 | # Configures awstats user and password 10 | htpasswd -cb /awstats/htpasswd $AWSTATS_USER $AWSTATS_PASS 11 | 12 | 13 | # Configures cron and starts it 14 | /usr/bin/awstats_buildstaticpages.pl -config=$VIRTUAL_HOST -update -dir=/awstats/www 15 | printf "#!/bin/bash\n/usr/bin/awstats_buildstaticpages.pl -config=$VIRTUAL_HOST -update -dir=/awstats/www" > /etc/periodic/15min/awstats 16 | chmod +x /etc/periodic/15min/awstats 17 | ln -fs /usr/share/zoneinfo/Europe/Berlin /etc/localtime 18 | 19 | crond -f -l 8 -------------------------------------------------------------------------------- /default_config.toml: -------------------------------------------------------------------------------- 1 | [contexts] 2 | default = "cnf" 3 | cnf = { suffix = ".cnf", idfunc = "cnf_hash", description = "DIMACS Conjunctive Normal Form (CNF)" } 4 | sancnf = { suffix = ".sanitized.cnf", idfunc = "cnf_hash", description = "Sanitized CNF" } 5 | kis = { suffix = ".kis", idfunc = "cnf_hash", description = "k-Independent Set Problem Graph" } 6 | opb = { suffix = ".opb", idfunc = "opb_hash", description = "Pseudo-Boolean Optimization Problem" } 7 | wcnf = { suffix = ".wcnf", idfunc = "wcnf_hash", description = "Weighted CNF (WCNF)" } 8 | wecnf = { suffix = ".wecnf", idfunc = "cnf_hash", description = "Weighted Extended CNF (WECNF)" } 9 | 10 | [transformers] 11 | 12 | [extractors] 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto eol=lf 3 | 4 | *.cs text diff=csharp 5 | *.java text diff=java 6 | *.html text diff=html 7 | *.css text 8 | *.js text 9 | *.sql text 10 | 11 | *.csproj text merge=union 12 | *.sln text merge=union eol=lf 13 | 14 | *.docx diff=astextplain 15 | *.DOCX diff=astextplain 16 | 17 | # absolute paths are ok, as are globs 18 | /**/postinst* text eol=lf 19 | 20 | # paths that don't start with / are treated relative to the .gitattributes folder 21 | relative/path/*.txt text eol=lf 22 | 23 | *.png binary 24 | *.jpg binary 25 | *.jpeg binary 26 | *.gif binary 27 | *.ico binary 28 | 29 | gbd_server/static/css/* linguist-vendored 30 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | nginx: 4 | image: udopia/nginx 5 | #image: mynginx 6 | environment: 7 | - VIRTUAL_HOST=benchmark-database.de 8 | - AWSTATS_USER=statsuser 9 | - AWSTATS_PASS=stats1234 10 | ports: 11 | - 80:80 12 | - 443:443 13 | restart: always 14 | volumes: 15 | - /home/iser/nginx/ssl:/etc/nginx/ssl 16 | - /home/iser/nginx/ssl/bot:/etc/nginx/ssl/bot 17 | 18 | gbd: 19 | depends_on: 20 | - nginx 21 | image: udopia/gbd 22 | #image: mygbd 23 | #environment: 24 | #- GBD_DB=/gbd/meta.db:/gbd/base.db:/gbd/gate.db 25 | ports: 26 | - 44071:44071 27 | volumes: 28 | - /home/iser/gbd:/raid/gbd:ro 29 | restart: always -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gbd_tools" 7 | version = "5.0.1" 8 | description = "GBD Tools: Maintenance and Distribution of Benchmark Instances and their Attributes" 9 | readme = "README.md" 10 | license-files = ["LICENSE"] 11 | requires-python = ">=3.6" 12 | authors = [{ name = "Ashlin Iser", email = "iser@kit.edu" }] 13 | urls = { Homepage = "https://github.com/Udopia/gbd" } 14 | classifiers = ["Programming Language :: Python :: 3"] 15 | dependencies = ["flask", "tatsu", "polars", "waitress", "pebble", "gbdc"] 16 | scripts = { gbd = "gbd:main" } 17 | 18 | [tool.setuptools] 19 | include-package-data = true 20 | py-modules = ["gbd"] 21 | packages = ["gbd_core", "gbd_init", "gbd_server"] 22 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | 4 | def get_random_clause(max_len=10, max_vars=30): 5 | return ' '.join([str(random.randint(-max_vars, max_vars)) for _ in range(random.randint(0, max_len))]) + ' 0' 6 | 7 | def get_random_formula(max_num=50): 8 | return '\n'.join([get_random_clause() for _ in range(random.randint(0, max_num))]) + '\n' 9 | 10 | def get_random_unique_filename(prefix='random', suffix='.cnf'): 11 | filename = prefix + suffix 12 | while os.path.exists(filename): 13 | filename = '{}{}{}'.format(prefix, random.randint(0, 1000), suffix) 14 | return filename 15 | 16 | def get_random_cnffile(max_num=50): 17 | filename = get_random_unique_filename() 18 | with 
open(filename, 'w') as f: 19 | f.write('p cnf {} {}\n'.format(random.randint(1, 100), random.randint(1, 100))) 20 | f.write(get_random_formula(max_num)) 21 | return filename -------------------------------------------------------------------------------- /setup.py.backup: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='gbd_tools', 4 | version='4.9.8', 5 | description='GBD Tools: Maintenance and Distribution of Benchmark Instances and their Attributes', 6 | long_description=open('README.md', 'rt').read(), 7 | long_description_content_type="text/markdown", 8 | url='https://github.com/Udopia/gbd', 9 | author='Ashlin Iser', 10 | author_email='markus.iser@kit.edu', 11 | packages=[ 12 | "gbd_core", 13 | "gbd_init", 14 | "gbd_server" 15 | ], 16 | scripts=[ 17 | "gbd.py" 18 | ], 19 | include_package_data=True, 20 | setup_requires=[ 21 | 'wheel', 22 | 'setuptools' 23 | ], 24 | install_requires=[ 25 | 'flask', 26 | 'tatsu', 27 | 'pandas', 28 | 'waitress', 29 | 'pebble', 30 | 'gbdc' 31 | ], 32 | install_obsoletes=['global-benchmark-database-tool'], 33 | classifiers=[ 34 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 35 | "Programming Language :: Python :: 3" 36 | ], 37 | entry_points={ 38 | "console_scripts": [ 39 | "gbd = gbd:main" 40 | ] 41 | } 42 | ) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - build-image 3 | - tag-image 4 | 5 | 6 | variables: 7 | BUILD_IMAGE: qpr-registry.iti.kit.edu/gbd/gbd:$CI_COMMIT_SHA 8 | RELEASE_IMAGE: qpr-registry.iti.kit.edu/gbd/gbd:latest 9 | 10 | build-image: 11 | image: docker:stable 12 | services: 13 | - docker:dind 14 | stage: build-image 15 | tags: 16 | - docker 17 | rules: 18 | - if: '$CI_COMMIT_BRANCH == "master"' 19 | - if: '$CI_COMMIT_BRANCH == "develop"' 20 | before_script: 21 | - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY 22 | after_script: 23 | - docker logout $CI_REGISTRY 24 | script: 25 | - docker build -t $BUILD_IMAGE . 26 | - docker push $BUILD_IMAGE 27 | 28 | tag-image: 29 | image: docker:stable 30 | services: 31 | - docker:dind 32 | stage: tag-image 33 | tags: 34 | - docker 35 | rules: 36 | - if: '$CI_COMMIT_BRANCH == "master"' 37 | before_script: 38 | - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY 39 | after_script: 40 | - docker logout $CI_REGISTRY 41 | script: 42 | - docker pull $BUILD_IMAGE 43 | - docker tag $BUILD_IMAGE $RELEASE_IMAGE 44 | - docker push $RELEASE_IMAGE 45 | 46 | -------------------------------------------------------------------------------- /docker/configs/nginx.conf: -------------------------------------------------------------------------------- 1 | events {} 2 | 3 | http { 4 | include /etc/nginx/mime.types; 5 | default_type application/octet-stream; 6 | sendfile on; 7 | 8 | proxy_http_version 1.1; 9 | proxy_buffering off; 10 | proxy_set_header Host $host; 11 | proxy_set_header Upgrade $http_upgrade; 12 | proxy_set_header Connection "Upgrade"; 13 | proxy_set_header X-Real-IP $remote_addr; 14 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 15 | proxy_set_header X-Forwarded-Host $host; 16 | proxy_set_header X-Forwarded-Port $server_port; 17 | proxy_set_header Proxy ""; 18 | log_format combined_ip '$http_x_forwarded_for - $remote_user [$time_local] ' 19 | '"$request" $status $body_bytes_sent ' 20 | '"$http_referer" "$http_user_agent"'; 21 | 22 | server { 23 | listen 80; 24 | access_log /awstats/access.log combined_ip; 25 | 26 | location / { 27 | #proxy_pass http://127.0.0.1:44071; 28 | proxy_pass http://gbd:44071; 29 | } 30 | 31 | location /stats { 32 | alias /awstats/www; 33 | index awstats.__VIRTUAL_HOST__.html index.html; 34 | try_files $uri $uri/ index.html; 35 | auth_basic "Restricted area"; 36 | auth_basic_user_file /awstats/htpasswd; 37 | access_log off; 38 | 39 | location /stats/icon { 40 | alias /usr/lib/awstats/icon; 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages-apidoc.yml: -------------------------------------------------------------------------------- 1 | name: apidocs 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Set up Python 3.8 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: 3.8 16 | 17 | - name: Install requirements for documentation generation 18 | run: | 19 | python -m pip install --upgrade pip setuptools wheel 20 | python -m pip install docutils pydoctor 21 | 22 | - name: Generate API documentation with pydoctor 23 | run: | 24 | 25 | # Run pydoctor build 26 | pydoctor \ 27 | --project-name=gbd \ 28 | 
--project-url=https://github.com/$GITHUB_REPOSITORY \ 29 | --html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \ 30 | --make-html \ 31 | --html-output=./apidocs \ 32 | --project-base-dir="$(pwd)" \ 33 | --docformat=restructuredtext \ 34 | --intersphinx=https://docs.python.org/3/objects.inv \ 35 | ./gbd_core 36 | 37 | - name: Push API documentation to Github Pages 38 | uses: peaceiris/actions-gh-pages@v3 39 | with: 40 | github_token: ${{ secrets.GITHUB_TOKEN }} 41 | publish_dir: ./apidocs 42 | commit_message: "Generate API documentation" -------------------------------------------------------------------------------- /gbd_core/config.py: -------------------------------------------------------------------------------- 1 | import toml 2 | import os 3 | import importlib.resources as pkg_resources 4 | 5 | ### Default Context 6 | default = "cnf" 7 | 8 | ### Load Configuration from Files 9 | def load_config(default_config_path, user_config_path=None): 10 | # Load the default configuration file 11 | with pkg_resources.open_text('gbd_tools', default_config_path) as f: 12 | config = toml.load(f) 13 | 14 | # If a user configuration file is provided, load it and update the config 15 | if user_config_path and os.path.exists(user_config_path): 16 | with open(user_config_path, 'r') as f: 17 | user_config = toml.load(f) 18 | config.update(user_config) 19 | 20 | return config 21 | 22 | ### Convert ConfigParser to Dictionary 23 | def config_to_dict(config): 24 | config_dict = {} 25 | for context, details in config['contexts'].items(): 26 | config_dict[context] = { 27 | "description": details["description"], 28 | "suffix": details["suffix"], 29 | "idfunc": globals()[details["idfunc"]], 30 | } 31 | return config_dict 32 | 33 | ### Paths to Configuration Files 34 | default_config_path = "default_config.toml" 35 | user_config_path = "user_config.toml" # Adjust this path as needed 36 | 37 | ### Load and Convert Configuration 38 | config_parser = load_config(default_config_path, user_config_path) 39 | config = config_to_dict(config_parser) -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | # release: 5 | # types: [ published ] 6 | push: 7 | branches: [ "main" ] 8 | # pull_request: 9 | # branches: [ "main" ] 10 | 11 | jobs: 12 | 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v1 19 | - name: Login to DockerHub Registry 20 | run: echo "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login -u udopia --password-stdin 21 | # - name: Get the version 22 | # id: vars 23 | # run: echo ::set-output name=tag::$(echo ${GITHUB_REF:10}) 24 | - name: Build the tagged Docker GBD image 25 | run: docker build docker/ --file docker/Dockerfile.gbd --tag udopia/gbd:${{ github.sha }} 26 | - name: Push the tagged Docker GBD image 27 | run: docker push udopia/gbd:${{ github.sha }} 28 | - name: Build the latest Docker GBD image 29 | run: docker build docker/ --file docker/Dockerfile.gbd --tag udopia/gbd:latest 30 | - name: Push the latest Docker GBD image 31 | run: docker push udopia/gbd:latest 32 | - name: Build the tagged Docker NGINX image 33 | run: docker build docker/ --file docker/Dockerfile.nginx --tag udopia/nginx:${{ github.sha }} 34 | - name: Push the tagged Docker NGINX image 35 | run: docker push udopia/nginx:${{ github.sha }} 36 | - name: Build the latest Docker NGINX image 
37 | run: docker build docker/ --file docker/Dockerfile.nginx --tag udopia/nginx:latest 38 | - name: Push the latest Docker NGINX image 39 | run: docker push udopia/nginx:latest 40 | 41 | -------------------------------------------------------------------------------- /docker/configs/nginx.https.conf: -------------------------------------------------------------------------------- 1 | events {} 2 | 3 | http { 4 | include /etc/nginx/mime.types; 5 | default_type application/octet-stream; 6 | sendfile on; 7 | 8 | proxy_http_version 1.1; 9 | proxy_buffering off; 10 | proxy_set_header Host $host; 11 | proxy_set_header Upgrade $http_upgrade; 12 | proxy_set_header Connection "Upgrade"; 13 | proxy_set_header X-Real-IP $remote_addr; 14 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 15 | proxy_set_header X-Forwarded-Host $host; 16 | proxy_set_header X-Forwarded-Port $server_port; 17 | proxy_set_header Proxy ""; 18 | log_format combined_ip '$http_x_forwarded_for - $remote_user [$time_local] ' 19 | '"$request" $status $body_bytes_sent ' 20 | '"$http_referer" "$http_user_agent"'; 21 | 22 | server { 23 | listen 80; 24 | listen [::]:80; 25 | access_log /awstats/access.log combined_ip; 26 | 27 | location ^~ /.well-known { 28 | root /etc/nginx/ssl/bot; 29 | } 30 | 31 | location / { 32 | return 301 https://$host$request_uri; 33 | } 34 | } 35 | 36 | server { 37 | listen 443 ssl http2; 38 | listen [::]:443 ssl http2; 39 | access_log /awstats/access.log combined_ip; 40 | 41 | ssl_certificate /etc/nginx/ssl/fullchain.pem; 42 | ssl_certificate_key /etc/nginx/ssl/privkey.pem; 43 | 44 | location / { 45 | proxy_pass http://gbd:44071; 46 | } 47 | 48 | location /stats { 49 | alias /awstats/www; 50 | index awstats.__VIRTUAL_HOST__.html index.html; 51 | try_files $uri $uri/ index.html; 52 | auth_basic "Restricted area"; 53 | auth_basic_user_file /awstats/htpasswd; 54 | access_log off; 55 | 56 | location /stats/icon { 57 | alias /usr/lib/awstats/icon; 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /tests/test_schema.py: -------------------------------------------------------------------------------- 1 | ##!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import unittest 6 | import sqlite3 7 | 8 | from gbd_core.database import Database 9 | from gbd_core.schema import Schema 10 | 11 | from tests import util 12 | 13 | class SchemaTestCase(unittest.TestCase): 14 | 15 | def setUp(self) -> None: 16 | self.file = util.get_random_unique_filename('test', '.db') 17 | sqlite3.connect(self.file).close() 18 | self.name = Schema.dbname_from_path(self.file) 19 | self.db = Database([self.file], verbose=False) 20 | return super().setUp() 21 | 22 | def tearDown(self) -> None: 23 | if os.path.exists(self.file): 24 | os.remove(self.file) 25 | return super().tearDown() 26 | 27 | def test_create_db(self): 28 | self.assertTrue(Schema.is_database(self.file)) 29 | self.assertEqual(len(self.db.get_databases()), 1) 30 | self.assertTrue(self.db.dexists(self.name)) 31 | 32 | def test_create_unique_feature(self): 33 | FEAT = "featA" 34 | self.db.create_feature(FEAT, default_value="empty") 35 | self.assertIn(FEAT, self.db.get_features()) 36 | self.assertIn("features", self.db.get_tables()) 37 | finfo = self.db.find(FEAT) 38 | self.assertEqual(finfo.table, "features") 39 | self.assertEqual(finfo.column, FEAT) 40 | self.assertEqual(finfo.default, "empty") 41 | self.assertEqual(finfo.database, self.name) 42 | 43 | def test_create_nonunique_feature(self): 
44 | FEAT = "featB" 45 | self.db.create_feature(FEAT, default_value=None) 46 | self.assertIn(FEAT, self.db.get_features()) 47 | self.assertIn("features", self.db.get_tables()) 48 | finfo = self.db.find(FEAT) 49 | self.assertEqual(finfo.table, FEAT) 50 | self.assertEqual(finfo.column, "value") 51 | self.assertEqual(finfo.default, None) 52 | self.assertEqual(finfo.database, self.name) 53 | 54 | -------------------------------------------------------------------------------- /gbd_server/static/main.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --background-color-1: #FFD497; 3 | --background-color-2: #E8E6EF; 4 | --text-color: #000000; 5 | --link-color-1: #823329; 6 | --link-color-2: #000000; 7 | --border-color: #274060; 8 | } 9 | 10 | html, body { 11 | height: 100%; 12 | margin: 0; 13 | } 14 | 15 | ul li { 16 | padding: 0.3em; 17 | } 18 | 19 | .main { 20 | width: 100%; 21 | height: 100%; 22 | display: flex; 23 | flex-direction: column; 24 | flex-wrap: nowrap; 25 | background:var(--background-color-1); 26 | color:var(--text-color); 27 | font-family:Arial,Helvetica,sans-serif; 28 | } 29 | 30 | header { 31 | flex-shrink: 0; 32 | border-bottom: 3px solid var(--border-color); 33 | } 34 | 35 | header > img { 36 | float: left; 37 | margin: 1em; 38 | filter: drop-shadow(-11px -7px 3px var(--border-color)); 39 | } 40 | 41 | header > .form { 42 | float: left; 43 | margin: 1em; 44 | } 45 | 46 | header > .help { 47 | padding: 1em; 48 | } 49 | 50 | header > .help > fieldset > ul { 51 | margin: 0em; 52 | } 53 | 54 | fieldset { 55 | border: 2px solid var(--border-color); 56 | } 57 | 58 | .content { 59 | flex-grow: 1; 60 | overflow: auto; 61 | min-height: 2em; 62 | background:var(--background-color-2); 63 | padding: 2px; 64 | } 65 | 66 | footer { 67 | padding: 1em; 68 | flex-shrink: 0; 69 | border-top: 3px solid var(--border-color); 70 | } 71 | 72 | a:link { color:var(--link-color-1); text-decoration:none; } 73 | a:visited { color:var(--link-color-1); text-decoration:none; } 74 | a:hover { color:var(--link-color-2); } 75 | a.active { color:var(--link-color-2); } 76 | 77 | input.query { 78 | padding: 4px 8px; 79 | margin: 12px 4px; 80 | width: 600px; 81 | } 82 | 83 | select.features { 84 | padding: 4px 8px; 85 | margin: 12px 4px; 86 | width: 600px; 87 | } 88 | 89 | button.submit { 90 | padding: 4px 8px; 91 | margin: 12px 4px; 92 | } 93 | 94 | table, th, td { 95 | border:1px solid var(--border-color); 96 | border-collapse: collapse; 97 | padding: 3px 7px; 98 | } 99 | 100 | td { 101 | vertical-align: top; 102 | font-family: monospace; 103 | } 104 | 105 | table th { 106 | background-color:var(--background-color-2); 107 | font-weight: bold; 108 | cursor: default; 109 | } 110 | 111 | table.sortable th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after { 112 | content: " \25B4\25BE" 113 | } 114 | -------------------------------------------------------------------------------- /tests/test_grammar.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | from gbd_core.grammar import Parser, ParserException 5 | 6 | class SchemaTestCase(unittest.TestCase): 7 | 8 | def setUp(self) -> None: 9 | return super().setUp() 10 | 11 | def tearDown(self) -> None: 12 | return super().tearDown() 13 | 14 | def test_query_nesting(self): 15 | parser = Parser("a = 1") 16 | self.assertEqual(parser.get_features(), set(["a"])) 17 | parser = Parser("a = 1 and b = 2") 18 | 
self.assertEqual(parser.get_features(), set(["a", "b"])) 19 | parser = Parser("a = 1 and (b = 2 or c = 3)") 20 | self.assertEqual(parser.get_features(), set(["a", "b", "c"])) 21 | parser = Parser("(b = 2 or c = 3) and a = 1") 22 | self.assertEqual(parser.get_features(), set(["a", "b", "c"])) 23 | 24 | def test_query_string_constraints(self): 25 | parser = Parser("a = val1") 26 | self.assertEqual(parser.get_features(), set(["a"])) 27 | parser = Parser("a = val1 and b != val2") 28 | self.assertEqual(parser.get_features(), set(["a", "b"])) 29 | parser = Parser("a like val1") 30 | self.assertEqual(parser.get_features(), set(["a"])) 31 | parser = Parser("a like val%") 32 | self.assertEqual(parser.get_features(), set(["a"])) 33 | parser = Parser("a like %val") 34 | self.assertEqual(parser.get_features(), set(["a"])) 35 | parser = Parser("a like %val%") 36 | self.assertEqual(parser.get_features(), set(["a"])) 37 | parser = Parser("a like val% and b unlike val%") 38 | self.assertEqual(parser.get_features(), set(["a", "b"])) 39 | with self.assertRaises(ParserException): 40 | parser = Parser("a = %val%") 41 | 42 | def test_query_arithmetic_constraints(self): 43 | parser = Parser("a = (1 + 2)") 44 | self.assertEqual(parser.get_features(), set(["a"])) 45 | parser = Parser("a = (1 - 2)") 46 | self.assertEqual(parser.get_features(), set(["a"])) 47 | parser = Parser("a = ((1 + 2) / b)") 48 | self.assertEqual(parser.get_features(), set(["a", "b"])) 49 | parser = Parser("a = (b)") 50 | self.assertEqual(parser.get_features(), set(["a", "b"])) 51 | parser = Parser("a = b") 52 | self.assertEqual(parser.get_features(), set(["a"])) 53 | 54 | def test_explicit_context(self): 55 | parser = Parser("c:a = 1") 56 | self.assertEqual(parser.get_features(), set(["c:a"])) 57 | parser = Parser("c:a = 1 and d:b = 2") 58 | self.assertEqual(parser.get_features(), set(["c:a", "d:b"])) -------------------------------------------------------------------------------- /tests/test_gbdhash.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import random 3 | import os 4 | 5 | from gbd_core.contexts import identify 6 | from tests import util 7 | 8 | 9 | class TestGBDHash(unittest.TestCase): 10 | 11 | def setUp(self): 12 | self.reference = util.get_random_formula() 13 | self.ref_file = "reference.cnf" 14 | with open(self.ref_file, 'w') as ref: 15 | ref.write(self.reference) 16 | self.reference_hash = identify(self.ref_file) 17 | 18 | def tearDown(self): 19 | if self.currentResult.wasSuccessful(): 20 | os.remove(self.ref_file) 21 | 22 | def run(self, result=None): 23 | self.currentResult = result 24 | unittest.TestCase.run(self, result) 25 | 26 | def get_random_character(self): 27 | c = chr(random.randint(0, 255)) 28 | return c if not c.isspace() else ' ' 29 | 30 | def get_random_string(self, min_length=0, max_length=20): 31 | return ''.join([self.get_random_character() for _ in range(random.randint(min_length, max_length))]) 32 | 33 | def get_random_whitespace_character(self): 34 | r = random.random() 35 | return '\t' if r < 0.25 else '\r' if r < 0.5 else '\n' if r < 0.75 else ' ' 36 | 37 | def get_random_whitespace(self, min_length=0, max_length=3): 38 | return ''.join([self.get_random_whitespace_character() for _ in range(random.randint(min_length, max_length))]) 39 | 40 | def get_random_header(self, p=0.5): 41 | return "p cnf {} {}\n".format(random.randint(1, 100), random.randint(1, 100)) if random.random() < p else "" 42 | 43 | def get_random_comment(self, p=0.5): 44 | 
return "c {}\n".format(self.get_random_string()) if random.random() < p else "" 45 | 46 | def test_randomized_variants(self): 47 | for _ in range(100): 48 | variant = self.get_random_whitespace() 49 | variant += self.get_random_comment() 50 | variant += self.get_random_header() 51 | variant += self.get_random_whitespace() 52 | for c in self.reference: 53 | if c.isspace(): 54 | variant += self.get_random_whitespace() 55 | variant += c 56 | if c.isspace(): 57 | variant += self.get_random_whitespace() 58 | variant += self.get_random_whitespace() 59 | 60 | var_file = "variant.cnf" 61 | with open(var_file, 'w') as f: 62 | f.write(variant) 63 | variant_hash = identify(var_file) 64 | if self.reference_hash == variant_hash: 65 | os.remove(var_file) 66 | 67 | self.assertEqual(self.reference_hash, variant_hash) 68 | -------------------------------------------------------------------------------- /gbd_core/contexts.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | from gbd_init.gbdhash import cnf_hash, opb_hash, wcnf_hash 16 | 17 | ### Default Context 18 | default = "cnf" 19 | 20 | ### Configuration of Available Contexts 21 | config = { 22 | "cnf": { 23 | "description": "Conjunctive Normal Form (CNF) in DIMACS format", 24 | "suffix": ".cnf", 25 | "idfunc": cnf_hash, 26 | }, 27 | "sancnf": { 28 | "description": "Sanitized Conjunctive Normal Form (CNF) in DIMACS format", 29 | "suffix": ".sanitized.cnf", 30 | "idfunc": cnf_hash, 31 | }, 32 | "kis": { 33 | "description": "k-Independent Set (KIS) in DIMACS-like graph format", 34 | "suffix": ".kis", 35 | "idfunc": cnf_hash, 36 | }, 37 | "opb": { 38 | "description": "Pseudo-Boolean Optimization Problem in OPB format", 39 | "suffix": ".opb", 40 | "idfunc": opb_hash, 41 | }, 42 | "wecnf": { 43 | "description": "Weighted Extended Conjunctive Normal Form (WECNF)", 44 | "suffix": ".wecnf", 45 | "idfunc": cnf_hash, 46 | }, 47 | "wcnf": { 48 | "description": "MaxSAT instances in WCNF format", 49 | "suffix": ".wcnf", 50 | "idfunc": wcnf_hash, 51 | }, 52 | } 53 | 54 | 55 | def description(context): 56 | return config[context]["description"] 57 | 58 | 59 | def suffixes(context): 60 | return [config[context]["suffix"] + p for p in ["", ".gz", ".lzma", ".xz", ".bz2"]] 61 | 62 | 63 | def idfunc(context): 64 | return config[context]["idfunc"] 65 | 66 | 67 | def contexts(): 68 | return config.keys() 69 | 70 | 71 | def default_context(): 72 | return default 73 | 74 | 75 | def get_context_by_suffix(benchmark): 76 | for context in contexts(): 77 | for suffix in suffixes(context): 78 | if benchmark.endswith(suffix): 79 | return context 80 | return None 81 | 82 | 83 | def identify(path, ct=None): 84 | context = ct or get_context_by_suffix(path) 85 | if context is None: 86 | raise Exception("Unable to associate context: " + 
path) 87 | else: 88 | idf = idfunc(context) 89 | return idf(path) 90 | -------------------------------------------------------------------------------- /gbd_init/gbdhash.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import io 16 | import hashlib 17 | 18 | import gzip 19 | import bz2 20 | import lzma 21 | 22 | 23 | def open_file(filename, mode): 24 | if filename.endswith(".gz"): 25 | return gzip.open(filename, mode) 26 | elif filename.endswith(".bz2"): 27 | return bz2.open(filename, mode) 28 | elif filename.endswith(".lzma") or filename.endswith(".xz"): 29 | return lzma.open(filename, mode) 30 | else: 31 | return open(filename, mode) 32 | 33 | 34 | try: 35 | from gbdc import opbhash as opb_hash 36 | except ImportError: 37 | 38 | def opb_hash(filename): 39 | raise Exception("Unable to import opbhash. Please install or update gbdc: https://github.com/Udopia/gbdc") 40 | 41 | 42 | try: 43 | from gbdc import gbdhash as cnf_hash 44 | except ImportError: 45 | try: 46 | from gbdhashc import gbdhash as cnf_hash 47 | except ImportError: 48 | 49 | def cnf_hash(filename): 50 | file = open_file(filename, "rb") 51 | buff = io.BufferedReader(file, io.DEFAULT_BUFFER_SIZE * 16) 52 | 53 | space = False 54 | skip = False 55 | start = True 56 | cldelim = True 57 | hash_md5 = hashlib.md5() 58 | 59 | for byte in iter(lambda: buff.read(1), b""): 60 | if not skip and (byte >= b"0" and byte <= b"9" or byte == b"-"): 61 | cldelim = byte == b"0" and (space or start) 62 | start = False 63 | if space: 64 | space = False 65 | hash_md5.update(b" ") 66 | hash_md5.update(byte) 67 | elif byte <= b" ": 68 | space = not start # remember non-leading space characters 69 | skip = skip and byte != b"\n" and byte != b"\r" # comment line ended 70 | else: # byte == b'c' or byte == b'p': 71 | skip = True # do not hash comment and header line 72 | 73 | if not cldelim: 74 | hash_md5.update(b" 0") 75 | 76 | file.close() 77 | 78 | return hash_md5.hexdigest() 79 | 80 | 81 | try: 82 | from gbdc import wcnfhash as wcnf_hash 83 | except ImportError: 84 | 85 | def wcnf_hash(filename): 86 | raise Exception("Unable to import wcnfhash. 
Please install or update gbdc: https://github.com/Udopia/gbdc") 87 | -------------------------------------------------------------------------------- /tests/test_initializer.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import polars as pl 5 | import random 6 | import sqlite3 7 | 8 | from gbd_core.database import Database 9 | from gbd_core.schema import Schema 10 | from gbd_core.api import GBD, GBDException 11 | from gbd_init.initializer import Initializer 12 | from gbd_core.contexts import identify 13 | from gbd_init.feature_extractors import init_local, init_features_generic, generic_extractors 14 | 15 | from tests import util 16 | 17 | class InitTestCase(unittest.TestCase): 18 | 19 | def setUp(self) -> None: 20 | self.file = util.get_random_unique_filename('test', '.db') 21 | sqlite3.connect(self.file).close() 22 | self.name = Schema.dbname_from_path(self.file) 23 | self.db = Database([self.file], verbose=False) 24 | self.benchmark = "benchmark.cnf" 25 | self.dir = os.path.dirname(os.path.realpath(self.benchmark)) 26 | with open(self.benchmark, 'w') as file: 27 | file.write(util.get_random_formula(20)) 28 | self.reference_hash = identify(self.benchmark) 29 | return super().setUp() 30 | 31 | def tearDown(self) -> None: 32 | if os.path.exists(self.file): 33 | os.remove(self.file) 34 | if os.path.exists(self.benchmark): 35 | os.remove(self.benchmark) 36 | return super().tearDown() 37 | 38 | def init_random(self, hash, path, limits): 39 | return [ ('random', hash, random.randint(1, 1000)) ] 40 | 41 | def test_init_random(self): 42 | api = GBD([self.file], verbose=False) 43 | rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 } 44 | init = Initializer(api, rlimits, self.name, [('random', 0)], self.init_random) 45 | init.create_features() 46 | self.assertTrue(api.feature_exists('random')) 47 | df: pl.DataFrame = pl.DataFrame([(str(n), None) for n in range(100)], schema=["hash", "local"], orient="row") 48 | init.run(df) 49 | df: pl.DataFrame = api.query("random > 0", [], ["random"]) 50 | self.assertEqual(len(df), 100) 51 | 52 | def test_init_local(self): 53 | api = GBD([self.file], verbose=False) 54 | rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 } 55 | init_local(api, rlimits, self.dir, self.name) 56 | self.assertTrue(api.feature_exists('local')) 57 | df: pl.DataFrame = api.query("local like %benchmark.cnf", [], ["local"]) 58 | self.assertEqual(len(df), 1) 59 | self.assertEqual(df.to_dicts()[0]['local'], os.path.realpath(self.benchmark)) 60 | self.assertEqual(df.to_dicts()[0]['hash'], self.reference_hash) 61 | 62 | def test_init_cnf_features_generic(self): 63 | api = GBD([self.file], verbose=False) 64 | rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 } 65 | init_local(api, rlimits, self.dir, self.name) 66 | df: pl.DataFrame = api.query("local like %benchmark.cnf", [], ["local"]) 67 | for key in generic_extractors.keys(): 68 | if 'cnf' in generic_extractors[key]['contexts']: 69 | init_features_generic(key, api, rlimits, df, self.name) 70 | for feature in generic_extractors[key]['features']: 71 | self.assertTrue(api.feature_exists(feature[0])) 72 | 73 | -------------------------------------------------------------------------------- /gbd_init/initializer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import multiprocessing 16 | import time 17 | import pebble 18 | from concurrent.futures import as_completed 19 | import polars as pl 20 | 21 | from gbd_core.util import eprint 22 | from gbd_core.api import GBD, GBDException 23 | from gbd_core import util 24 | import gbdc 25 | import os 26 | 27 | 28 | class InitializerException(Exception): 29 | pass 30 | 31 | 32 | class Initializer: 33 | def __init__(self, api: GBD, rlimits: dict, target_db: str, features: list, initfunc): 34 | self.api = api 35 | self.api.database.set_auto_commit(False) 36 | self.target_db = target_db 37 | self.features = features 38 | self.initfunc = initfunc 39 | self.rlimits = rlimits 40 | 41 | def prep_data(self, rec, hash): 42 | return [ 43 | (key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items() if self.api.feature_exists(key) 44 | ] 45 | 46 | def create_features(self): 47 | for name, default in self.features: 48 | self.api.database.create_feature(name, default, self.target_db, True) 49 | self.api.database.commit() 50 | 51 | def save_features(self, result: list): 52 | for attr in result: 53 | name, hashv, value = attr[0], attr[1], attr[2] 54 | self.api.database.set_values(name, value, [hashv], self.target_db) 55 | self.api.database.commit() 56 | 57 | def run(self, instances: pl.DataFrame): 58 | if self.rlimits["jobs"] == 1: 59 | self.init_sequential(instances) 60 | else: 61 | self.init_parallel_pp(instances) 62 | 63 | def init_sequential(self, instances: pl.DataFrame): 64 | for row in instances.iter_rows(named=True): 65 | result = self.initfunc(row["hash"], row["local"], self.rlimits) 66 | self.save_features(result) 67 | 68 | def init_parallel_pp(self, instances: pl.DataFrame): 69 | with pebble.ProcessPool(max_workers=self.rlimits["jobs"], max_tasks=1, context=multiprocessing.get_context("forkserver")) as p: 70 | futures = [p.schedule(self.initfunc, (row["hash"], row["local"], self.rlimits)) for row in instances.iter_rows(named=True)] 71 | for f in as_completed(futures): # , timeout=api.tlim if api.tlim > 0 else None): 72 | try: 73 | result = f.result() 74 | self.save_features(result) 75 | except pebble.ProcessExpired as e: 76 | f.cancel() 77 | util.eprint("{}: {}".format(e.__class__.__name__, e)) 78 | except GBDException as e: # might receive special handling in the future 79 | util.eprint("{}: {}".format(e.__class__.__name__, e)) 80 | except Exception as e: 81 | import traceback 82 | 83 | traceback.print_exc() 84 | util.eprint("{}: {}".format(e.__class__.__name__, e)) 85 | -------------------------------------------------------------------------------- /tests/test_db_nonunique_features.py: -------------------------------------------------------------------------------- 1 | ##!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import unittest 6 | import sqlite3 7 | 8 | from gbd_core.query import GBDQuery 9 | from gbd_core.database import Database, 
DatabaseException 10 | from gbd_core.schema import Schema 11 | from tests import util 12 | 13 | class DatabaseTestCase(unittest.TestCase): 14 | 15 | feat = "nonunique_feature" 16 | val1 = "value1" 17 | val2 = "value2" 18 | 19 | def setUp(self) -> None: 20 | self.file = util.get_random_unique_filename('test', '.db') 21 | sqlite3.connect(self.file).close() 22 | self.name = Schema.dbname_from_path(self.file) 23 | self.db = Database([self.file], verbose=False) 24 | self.db.create_feature(self.feat, default_value=None) 25 | self.db.set_values(self.feat, self.val1, ["a", "b", "c"]) 26 | self.db.set_values(self.feat, self.val2, ["a", "b", "c"]) 27 | return super().setUp() 28 | 29 | def tearDown(self) -> None: 30 | if os.path.exists(self.file): 31 | os.remove(self.file) 32 | return super().tearDown() 33 | 34 | def query(self, feat, val): 35 | qb = GBDQuery(self.db, "{}={}".format(feat, val)) 36 | q = qb.build_query() 37 | return [ hash for (hash, ) in self.db.query(q) ] 38 | 39 | def dump(self): 40 | import sqlite3 41 | conn = sqlite3.connect(self.file) 42 | for line in conn.iterdump(): 43 | print(line) 44 | conn.close() 45 | 46 | # Test that the feature values are initialized correctly in test setup 47 | def test_feature_values_exist(self): 48 | res = self.query(self.feat, self.val1) 49 | self.assertEqual(len(res), 3) 50 | self.assertSetEqual(set(res), set(["a", "b", "c"])) 51 | res = self.query(self.feat, self.val2) 52 | self.assertEqual(len(res), 3) 53 | self.assertSetEqual(set(res), set(["a", "b", "c"])) 54 | 55 | # Delete specific hash-value pair and check that it is gone and the others are still there 56 | def test_feature_values_delete_hash_value(self): 57 | self.db.delete(self.feat, [ self.val1 ], ["a"]) 58 | res = self.query(self.feat, self.val1) 59 | self.assertEqual(len(res), 2) 60 | self.assertSetEqual(set(res), set(["b", "c"])) 61 | res = self.query(self.feat, self.val2) 62 | self.assertEqual(len(res), 3) 63 | self.assertSetEqual(set(res), set(["a", "b", "c"])) 64 | 65 | # Delete specific hash and check that it is gone and the others are still there 66 | def test_feature_values_delete_hash(self): 67 | self.db.delete(self.feat, [ ], ["a"]) 68 | res = self.query(self.feat, self.val1) 69 | self.assertEqual(len(res), 2) 70 | self.assertSetEqual(set(res), set(["b", "c"])) 71 | res = self.query(self.feat, self.val2) 72 | self.assertEqual(len(res), 2) 73 | self.assertSetEqual(set(res), set(["b", "c"])) 74 | res = self.query(self.feat, "None") 75 | self.assertEqual(len(res), 1) 76 | self.assertSetEqual(set(res), set(["a"])) 77 | 78 | # Delete specific value and check that it is gone and the others are still there 79 | def test_feature_values_delete_value(self): 80 | self.db.delete(self.feat, [ self.val1 ], [ ]) 81 | res = self.query(self.feat, self.val1) 82 | self.assertEqual(len(res), 0) 83 | res = self.query(self.feat, self.val2) 84 | self.assertEqual(len(res), 3) 85 | self.assertSetEqual(set(res), set([ "a", "b", "c" ])) 86 | 87 | # Delete feature 88 | def test_nonunique_feature_delete(self): 89 | self.db.delete_feature(self.feat) 90 | self.assertRaises(DatabaseException, self.db.find, self.feat) -------------------------------------------------------------------------------- /gbd_core/util_argparse.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software 
and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import argparse 16 | import os 17 | import re 18 | 19 | 20 | def get_gbd_argparser(): 21 | parser = argparse.ArgumentParser(description="GBD Benchmark Database") 22 | parser.add_argument("-d", "--db", type=gbd_db_type, default=os.environ.get("GBD_DB"), help="Specify database to work with") 23 | parser.add_argument("-v", "--verbose", action="store_true", help="Print additional (or diagnostic) information to stderr") 24 | return parser 25 | 26 | 27 | def add_query_and_hashes_arguments(parser: argparse.ArgumentParser): 28 | parser.add_argument("query", help="GBD Query", nargs="?") 29 | parser.add_argument( 30 | "--hashes", help="Explicitly select instances: Hashes can be passed as arguments to this option, but also via .", nargs="*", default=[] 31 | ) 32 | 33 | 34 | def add_resource_limits_arguments(parser: argparse.ArgumentParser): 35 | parser.add_argument("-j", "--jobs", default=1, type=int, help="Set number of parallel jobs") 36 | parser.add_argument( 37 | "-t", 38 | "--tlim", 39 | default=5000, 40 | type=int, 41 | help="Time limit (sec) per instance for 'init' sub-commands (also used for score calculation in 'eval' and 'plot')", 42 | ) 43 | parser.add_argument("-m", "--mlim", default=2000, type=int, help="Memory limit (MB) per instance for 'init' sub-commands") 44 | parser.add_argument("-f", "--flim", default=1000, type=int, help="File size limit (MB) per instance for 'init' sub-commands which create files") 45 | 46 | 47 | ### Argument Types for Input Sanitation in ArgParse Library 48 | def directory_type(path): 49 | if not os.path.isdir(path): 50 | raise argparse.ArgumentTypeError("{0} is not a directory".format(path)) 51 | if os.access(path, os.R_OK): 52 | return os.path.abspath(path) 53 | else: 54 | raise argparse.ArgumentTypeError("{0} is not readable".format(path)) 55 | 56 | 57 | def file_type(path): 58 | if not os.path.isfile(path): 59 | raise argparse.ArgumentTypeError("{0} is not a regular file".format(path)) 60 | if os.access(path, os.R_OK): 61 | return os.path.abspath(path) 62 | else: 63 | raise argparse.ArgumentTypeError("{0} is not readable".format(path)) 64 | 65 | 66 | def column_type(s): 67 | pat = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$") 68 | if not pat.match(s): 69 | raise argparse.ArgumentTypeError('Column "{0}" does not match regular expression {1}'.format(s, pat.pattern)) 70 | return s 71 | 72 | 73 | def key_value_type(s): 74 | tup = s.split("=", 1) 75 | if len(tup) != 2: 76 | raise argparse.ArgumentTypeError("key-value type: {0} must be separated by exactly one = ".format(s)) 77 | return (column_type(tup[0]), tup[1]) 78 | 79 | 80 | def gbd_db_type(dbstr): 81 | if not dbstr: 82 | default = os.environ.get("GBD_DB") 83 | if not default: 84 | raise argparse.ArgumentTypeError("Datasources Missing: Set GBD_DB environment variable (Find databases here: https://benchmark-database.de)") 85 | return default # .split(':') 86 | return dbstr 87 | -------------------------------------------------------------------------------- /tests/test_querybuilder.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import sqlite3 3 | import os 4 | 5 | from gbd_core.schema import Schema 6 | from gbd_core.database import Database 7 | from gbd_core.query import GBDQuery 8 | 9 | import tests.util as util 10 | 11 | class QueryNonUniqueTestCase(unittest.TestCase): 12 | 13 | feat = "nonuniquefeature" 14 | feat2 = "nonuniquefeature2" 15 | feat3 = "numericfeature" 16 | val1 = "value1" 17 | val2 = "value2" 18 | hashes = [ "a", "b", "c" ] 19 | 20 | def setUp(self) -> None: 21 | self.file1 = util.get_random_unique_filename('test1', '.db') 22 | self.file2 = util.get_random_unique_filename('test2', '.db') 23 | sqlite3.connect(self.file1).close() 24 | sqlite3.connect(self.file2).close() 25 | self.dbname1 = Schema.dbname_from_path(self.file1) 26 | self.dbname2 = Schema.dbname_from_path(self.file2) 27 | self.db = Database([self.file1,self.file2], verbose=False) 28 | 29 | self.db.create_feature(self.feat, default_value=None, target_db=self.dbname1) 30 | self.db.set_values(self.feat, self.val1, self.hashes) 31 | 32 | self.db.create_feature(self.feat, default_value=None, target_db=self.dbname2) 33 | self.db.set_values(self.feat, self.val1, self.hashes[:1], target_db=self.dbname2) 34 | self.db.set_values(self.feat, self.val2, self.hashes, target_db=self.dbname2) 35 | 36 | self.db.create_feature(self.feat2, default_value=None, target_db=self.dbname2) 37 | self.db.set_values(self.feat2, self.val2, self.hashes) 38 | 39 | self.db.create_feature(self.feat3, default_value=0, target_db=self.dbname1) 40 | self.db.set_values(self.feat3, 1, self.hashes[0]) 41 | self.db.set_values(self.feat3, 10, self.hashes[1]) 42 | self.db.set_values(self.feat3, 100, self.hashes[2]) 43 | 44 | return super().setUp() 45 | 46 | def tearDown(self) -> None: 47 | if os.path.exists(self.file1): 48 | os.remove(self.file1) 49 | if os.path.exists(self.file2): 50 | os.remove(self.file2) 51 | return super().tearDown() 52 | 53 | def simple_query(self, feat, val, dbname=None): 54 | if dbname is None: 55 | return self.query("{}={}".format(feat, val)) 56 | else: 57 | return self.query("{}:{}={}".format(dbname, feat, val)) 58 | 59 | def query(self, query): 60 | q = GBDQuery(self.db, query).build_query() 61 | return [ hash for (hash, ) in self.db.query(q) ] 62 | 63 | # def dump(self): 64 | # import sqlite3 65 | # conn = sqlite3.connect(self.file) 66 | # for line in conn.iterdump(): 67 | # print(line) 68 | # conn.close() 69 | 70 | 71 | def test_feature_precedence_rules(self): 72 | res = self.simple_query(self.feat, self.val1) 73 | self.assertEqual(len(res), 3) 74 | res = self.simple_query(self.feat, self.val2) 75 | self.assertEqual(len(res), 0) 76 | res = self.simple_query(self.feat, self.val1, self.dbname1) 77 | self.assertEqual(len(res), 3) 78 | res = self.simple_query(self.feat, self.val2, self.dbname2) 79 | self.assertEqual(len(res), 3) 80 | 81 | def test_string_inequality(self): 82 | res = self.query("{} < {}".format(self.feat, self.val2)) 83 | self.assertEqual(len(res), 3) 84 | res = self.query("{} > {}".format(self.feat, self.val1)) 85 | self.assertEqual(len(res), 0) 86 | 87 | def test_numeric_inequality(self): 88 | res = self.query("{} < 2".format(self.feat3)) 89 | self.assertEqual(len(res), 1) 90 | 91 | def test_multivalued_subselect(self): 92 | res = self.query("{db}:{f} != {v1} and {db}:{f} = {v2}".format(f=self.feat, v1=self.val1, v2=self.val2, db=self.dbname2)) 93 | self.assertEqual(len(res), 2) 94 | 95 | def test_feature_accessible(self): 96 | res 
= self.simple_query(self.feat2, self.val2) 97 | self.assertEqual(len(res), 3) -------------------------------------------------------------------------------- /tests/test_db_unique_features.py: -------------------------------------------------------------------------------- 1 | ##!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import unittest 6 | import sqlite3 7 | 8 | from gbd_core.query import GBDQuery 9 | from gbd_core.database import Database, DatabaseException 10 | from gbd_core.schema import Schema 11 | 12 | from tests import util 13 | 14 | class DatabaseTestCase(unittest.TestCase): 15 | 16 | feat = "unique_feature" 17 | val1 = "value1" 18 | val2 = "value2" 19 | defv = "empty" 20 | 21 | def setUp(self) -> None: 22 | self.file = util.get_random_unique_filename('test', '.db') 23 | sqlite3.connect(self.file).close() 24 | self.name = Schema.dbname_from_path(self.file) 25 | self.db = Database([self.file], verbose=False) 26 | self.db.create_feature(self.feat, default_value=self.defv) 27 | self.db.set_values(self.feat, self.val1, ["a", "b", "c"]) 28 | return super().setUp() 29 | 30 | def tearDown(self) -> None: 31 | if os.path.exists(self.file): 32 | os.remove(self.file) 33 | return super().tearDown() 34 | 35 | def query(self, feat, val): 36 | qb = GBDQuery(self.db, "{}={}".format(feat, val)) 37 | q = qb.build_query() 38 | return [ hash for (hash, ) in self.db.query(q) ] 39 | 40 | def dump(self): 41 | import sqlite3 42 | conn = sqlite3.connect(self.file) 43 | for line in conn.iterdump(): 44 | print(line) 45 | conn.close() 46 | 47 | # Test that the feature values are initialized correctly in test setup 48 | def test_unique_feature_values_exist(self): 49 | res = self.query(self.feat, self.val1) 50 | self.assertEqual(len(res), 3) 51 | self.assertSetEqual(set(res), set(["a", "b", "c"])) 52 | 53 | # Overwrite one value and check if it is set correctly and that the other values are still there 54 | def test_unique_feature_values_overwrite(self): 55 | self.db.set_values(self.feat, self.val2, ["a"]) 56 | res = self.query(self.feat, self.val1) 57 | self.assertEqual(len(res), 2) 58 | self.assertSetEqual(set(res), set(["b", "c"])) 59 | res2 = self.query(self.feat, self.val2) 60 | self.assertEqual(len(res2), 1) 61 | self.assertSetEqual(set(res2), set(["a"])) 62 | 63 | # Delete specific hash-value pair and check if it is deleted (=set to default value) and that the other values are still there 64 | def test_unique_feature_values_delete_hash_value(self): 65 | self.db.delete(self.feat, [ self.val1 ], ["a"]) 66 | res = self.query(self.feat, self.val1) 67 | self.assertEqual(len(res), 2) 68 | self.assertSetEqual(set(res), set(["b", "c"])) 69 | res = self.query(self.feat, self.defv) 70 | self.assertEqual(len(res), 1) 71 | self.assertSetEqual(set(res), set(["a"])) 72 | 73 | # Delete specific hash and check if it is deleted (=set to default value) and that the other values are still there 74 | def test_unique_feature_values_delete_hash(self): 75 | self.db.delete(self.feat, [ ], ["a"]) 76 | res = self.query(self.feat, self.val1) 77 | self.assertEqual(len(res), 2) 78 | self.assertSetEqual(set(res), set(["b", "c"])) 79 | res = self.query(self.feat, self.defv) 80 | self.assertEqual(len(res), 1) 81 | self.assertSetEqual(set(res), set(["a"])) 82 | 83 | # Delete specific value and check if it is deleted (=set to default value) and that the other values are still there 84 | def test_unique_feature_values_delete_value(self): 85 | self.db.delete(self.feat, [ self.val1 ], [ ]) 86 | res = 
self.query(self.feat, self.val1) 87 | self.assertEqual(len(res), 0) 88 | res = self.query(self.feat, self.defv) 89 | self.assertEqual(len(res), 3) 90 | self.assertSetEqual(set(res), set([ "a", "b", "c" ])) 91 | 92 | # Delete feature 93 | def test_unique_feature_delete(self): 94 | self.db.delete_feature(self.feat) 95 | self.assertRaises(DatabaseException, self.db.find, self.feat) 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Global Benchmark Database (GBD) 2 | 3 | [![DOI](https://zenodo.org/badge/141396410.svg)](https://doi.org/10.5281/zenodo.17820182) 4 | 5 | GBD is a comprehensive suite of tools for provisioning and sustainably maintaining benchmark instances and their metadata for empirical research on hard algorithmic problem classes. 6 | For an introduction to the GBD concept, the underlying data model, and specific use cases, please refer to our [2024 SAT Tool Paper](https://doi.org/10.4230/LIPIcs.SAT.2024.18). 7 | 8 | ## GBD 5.0 Release Notes 9 | 10 | In addition to several bug fixes and performance improvements, GBD 5.0 no longer depends on Pandas for its interface module. 11 | This simplifies installation and use in various environments. 12 | The faster, more lightweight Polars library is now used for dataframes instead. 13 | Therefore, upgrading to GBD 5.0 requires existing code to be adapted to use Polars dataframes, or Polars dataframes to be explicitly converted to Pandas dataframes (e.g. via df.to_pandas()). 14 | 15 | ## GBD contributes data to your algorithmic evaluations 16 | 17 | GBD provides benchmark instance identifiers, feature extractors, and instance transformers for hard algorithmic problem domains, now including propositional satisfiability (SAT), maximum satisfiability (MaxSAT), and pseudo-Boolean optimization (PBO). 18 | 19 | ## GBD solves several problems 20 | 21 | - benchmark instance identification 22 | - identification of equivalence classes of benchmark instances 23 | - distribution of benchmark instances and benchmark metadata 24 | - initialization and maintenance of instance feature databases 25 | - transformation algorithms for benchmark instances 26 | 27 | GBD provides an extensible set of problem domains, feature extractors, and instance transformers. 28 | For a description of those currently supported, see the [GBDC documentation](https://udopia.github.io/gbdc/doc/Index.html). 29 | GBDC is a Python extension module for GBD's performance-critical code (written in C++), maintained in a separate [repository](https://github.com/Udopia/gbdc). 30 | 31 | ## Installation and Configuration 32 | 33 | - Run `pip install gbd-tools` 34 | - Run `pip install gbdc` (optional; installs the `gbdc` extension module) 35 | - Obtain a GBD database, e.g. download [https://benchmark-database.de/getdatabase/meta.db](https://benchmark-database.de/getdatabase/meta.db). 36 | - Configure your environment by registering paths to databases like this: `export GBD_DB=path/to/database1:path/to/database2`. 37 | - Test the command line interface with the `gbd info` and `gbd --help` commands. 38 | 39 | ## GBD Interfaces 40 | 41 | GBD provides the command-line tool `gbd`, the web interface `gbd serve`, and the Python interface `gbd_core.api.GBD`. 42 | 43 | ### GBD Command-Line Interface 44 | 45 | The central commands in `gbd` are `gbd get` for data access and `gbd init` for database initialization. 46 | See `gbd --help` for more commands.
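For example, assuming the `meta.db` database from the installation step is registered in `GBD_DB` and provides the `family` feature, a family query on the command line might look like the following sketch (the exact argument and option names are those reported by `gbd get --help`):

```
gbd get "family=hardware-bmc"
```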
47 | Once a database is registered in the environment variable `GBD_DB`, the `gbd get` command can be used to access data. 48 | See `gbd get --help` for more information. 49 | `gbd init` provides access to registered feature extractors, such as those provided by the `gbdc` extension module. 50 | All initialization routines can be run in parallel, and resource limits can be set per process. 51 | See `gbd init --help` for more information. 52 | 53 | ### GBD Server 54 | 55 | The GBD server can be started locally with `gbd serve`. Our instance of the GBD server is hosted at [https://benchmark-database.de/](https://benchmark-database.de/). 56 | You can download benchmark instances and prebuilt feature databases from there. 57 | 58 | ### GBD Python Interface 59 | 60 | The GBD Python interface is used by all programs in the GBD ecosystem. Its most important method is `query`, which returns GBD data in the form of a Polars dataframe for further analysis, as shown in the following example. 61 | 62 | ```Python 63 | from gbd_core.api import GBD 64 | with GBD(['path/to/database1', 'path/to/database2', ...]) as gbd: 65 | df = gbd.query("family = hardware-bmc", resolve=['verified-result', 'runtime-kissat']) 66 | ``` 67 | 68 | Scripts and use cases of GBD's Python interface are available at [https://udopia.github.io/gbdeval/](https://udopia.github.io/gbdeval/). 69 | The [evaluation demo](https://udopia.github.io/gbdeval/demo_evaluation.html) demonstrates portfolio analysis and subsequent category-wise performance evaluation using the 2023 SAT competition data. 70 | The [prediction demo](https://udopia.github.io/gbdeval/demo_prediction.html) demonstrates category prediction from instance features and subsequent feature importance evaluation. 71 | 72 | -------------------------------------------------------------------------------- /gbd_core/util.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import sys 16 | import os 17 | 18 | 19 | # Thanks to Boris V.
for this code https://stackoverflow.com/questions/4675728/redirect-stdout-to-a-file-in-python 20 | from contextlib import contextmanager 21 | 22 | 23 | def fileno(file_or_fd): 24 | fd = getattr(file_or_fd, "fileno", lambda: file_or_fd)() 25 | if not isinstance(fd, int): 26 | raise ValueError("Expected a file (`.fileno()`) or a file descriptor") 27 | return fd 28 | 29 | 30 | @contextmanager 31 | def stdout_redirected(to=os.devnull, stdout=None): 32 | if stdout is None: 33 | stdout = sys.stdout 34 | 35 | stdout_fd = fileno(stdout) 36 | # copy stdout_fd before it is overwritten 37 | # NOTE: `copied` is inheritable on Windows when duplicating a standard stream 38 | with os.fdopen(os.dup(stdout_fd), "wb") as copied: 39 | stdout.flush() # flush library buffers that dup2 knows nothing about 40 | try: 41 | os.dup2(fileno(to), stdout_fd) # $ exec >&to 42 | except ValueError: # filename 43 | with open(to, "wb") as to_file: 44 | os.dup2(to_file.fileno(), stdout_fd) # $ exec > to 45 | try: 46 | yield stdout # allow code to be run with the redirected stdout 47 | finally: 48 | # restore stdout to its previous value 49 | # NOTE: dup2 makes stdout_fd inheritable unconditionally 50 | stdout.flush() 51 | os.dup2(copied.fileno(), stdout_fd) # $ exec >&copied 52 | 53 | 54 | def slice_iterator(data, slice_len): 55 | it = iter(data) 56 | while True: 57 | items = [] 58 | for index in range(slice_len): 59 | try: 60 | item = next(it) 61 | except StopIteration: 62 | if items == []: 63 | return # we are done 64 | else: 65 | break # exits the "for" loop 66 | items.append(item) 67 | yield items 68 | 69 | 70 | def is_number(s): 71 | try: 72 | if s is not None: 73 | float(s) 74 | return True 75 | except ValueError: 76 | return False 77 | return False 78 | 79 | 80 | def eprint(*args, **kwargs): 81 | print(*args, file=sys.stderr, **kwargs) 82 | 83 | 84 | def read_hashes(): 85 | eprint("Reading hashes from stdin ...") 86 | hashes = list() 87 | try: 88 | while True: 89 | line = sys.stdin.readline().split() 90 | if len(line) == 0: 91 | return hashes 92 | hashes.extend(line) 93 | except KeyboardInterrupt: 94 | return hashes 95 | return hashes 96 | 97 | 98 | def confirm(prompt="Confirm", resp=False): 99 | """ 100 | prompts for yes or no response from the user. Returns True for yes and False for no. 101 | 'resp' should be set to the default value assumed by the caller when user simply types ENTER. 
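Illustrative example (derived from the prompt construction below): with the defaults, the prompt reads "Confirm [n]|y: ", and simply pressing ENTER returns False (the value of 'resp').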
102 | """ 103 | if resp: 104 | prompt = "%s [%s]|%s: " % (prompt, "y", "n") 105 | else: 106 | prompt = "%s [%s]|%s: " % (prompt, "n", "y") 107 | 108 | while True: 109 | ans = "z" 110 | try: 111 | ans = input(prompt) 112 | except EOFError: 113 | # This hack is for OSX and Linux only 114 | # There EOFError occurs when hashes were read from stdin before 115 | # Reopening stdin in order to facilitate subsequent user input: 116 | sys.stdin = open("/dev/tty", mode="r") 117 | ans = input() 118 | if not ans: 119 | return resp 120 | if ans not in ["y", "Y", "n", "N"]: 121 | print("please enter y or n.") 122 | continue 123 | if ans == "y" or ans == "Y": 124 | return True 125 | if ans == "n" or ans == "N": 126 | return False 127 | -------------------------------------------------------------------------------- /gbd_core/query.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | 16 | from gbd_core.database import Database, DatabaseException 17 | from gbd_core.grammar import Parser 18 | from gbd_core import contexts 19 | from gbd_core.schema import Schema 20 | 21 | 22 | class GBDQuery: 23 | def __init__(self, db: Database, query): 24 | self.db = db 25 | self.parser = Parser(query) 26 | self.features = self.parser.get_features() 27 | 28 | def features_exist_or_throw(self, features): 29 | for feature in features: 30 | self.db.find(feature) 31 | 32 | # Generate SQL Query from given GBD Query 33 | def build_query(self, hashes=[], resolve=[], group_by=None, join_type="LEFT", collapse=None): 34 | group = group_by or self.determine_group_by(resolve) 35 | 36 | self.features_exist_or_throw(resolve + [group] + list(self.features)) 37 | 38 | sql_select = self.build_select(group, resolve, collapse) 39 | 40 | sql_from = self.build_from(group, set(resolve) | self.features, join_type) 41 | 42 | sql_where = self.build_where(hashes, group) 43 | 44 | sql_groupby = "GROUP BY {}".format(self.db.faddr(group)) if collapse else "" 45 | sql_orderby = "ORDER BY {}".format(self.db.faddr(group)) 46 | 47 | return "{} {} WHERE {} {} {}".format(sql_select, sql_from, sql_where, sql_groupby, sql_orderby) 48 | 49 | def determine_group_by(self, resolve): 50 | if len(resolve) == 0: 51 | return self.db.dcontext(self.db.find("hash").database) + ":hash" 52 | else: 53 | return self.db.dcontext(self.db.find(resolve[0]).database) + ":hash" 54 | 55 | def build_select(self, group_by, resolve, collapse=None): 56 | result = [self.db.faddr(f) for f in [group_by] + resolve] 57 | if collapse and collapse != "none": 58 | result = ["{}(DISTINCT {})".format(collapse, r) for r in result] 59 | return "SELECT DISTINCT " + ", ".join(result) 60 | 61 | def find_translator_feature(self, source_context, target_context): 62 | for dbname in self.db.get_databases(source_context): 63 | # eprint("Checking database {} 
for translator".format(dbname)) 64 | if "to_" + target_context in self.db.get_features([dbname]): 65 | return self.db.find("to_" + target_context, dbname) 66 | 67 | for dbname in self.db.get_databases(target_context): 68 | # eprint("Checking database {} for translator".format(dbname)) 69 | if "to_" + source_context in self.db.get_features([dbname]): 70 | return self.db.find("to_" + source_context, dbname) 71 | 72 | raise DatabaseException("No translator feature found for contexts {} and {}".format(source_context, target_context)) 73 | 74 | def build_from(self, group, features, join_type="LEFT"): 75 | result = dict() 76 | 77 | gdatabase = self.db.find(group).database 78 | gtable = self.db.find(group).table 79 | gcontext = self.db.dcontext(gdatabase) 80 | gaddress = gdatabase + "." + gtable 81 | result[gaddress] = "FROM {}".format(gaddress) 82 | 83 | tables = set([(finfo.database, finfo.table) for finfo in [self.db.find(f) for f in features]]) 84 | for fdatabase, ftable in tables: 85 | faddress = fdatabase + "." + ftable 86 | ffeatures_address = fdatabase + ".features" 87 | if not faddress in result: # join only once 88 | fcontext = self.db.dcontext(fdatabase) 89 | if fcontext == gcontext: 90 | if faddress == ffeatures_address: # join features table directly 91 | result[faddress] = "{j} JOIN {t} ON {t}.hash = {g}.hash".format(j=join_type, t=ffeatures_address, g=gaddress) 92 | else: # join non-unique features table via features table 93 | fname = ftable 94 | if not ffeatures_address in result: 95 | result[ffeatures_address] = "{j} JOIN {t} ON {t}.hash = {g}.hash".format(j=join_type, t=ffeatures_address, g=gaddress) 96 | result[faddress] = "{j} JOIN {t} ON {t}.hash = {ft}.{n}".format(j=join_type, t=faddress, ft=ffeatures_address, n=fname) 97 | else: 98 | tfeat = self.find_translator_feature(gcontext, fcontext) 99 | direction = ("hash", "value") if self.db.dcontext(tfeat.database) == gcontext else ("value", "hash") 100 | 101 | taddress = tfeat.database + "." 
+ tfeat.table 102 | if not taddress in result: 103 | result[taddress] = "INNER JOIN {trans} ON {group}.hash = {trans}.{dir0}".format(trans=taddress, group=gaddress, dir0=direction[0]) 104 | 105 | result[faddress] = "INNER JOIN {feat} ON {trans}.{dir1} = {feat}.hash".format(feat=faddress, trans=taddress, dir1=direction[1]) 106 | 107 | return " ".join(result.values()) 108 | 109 | def build_where(self, hashes, group_by): 110 | group_column = self.db.faddr(group_by) 111 | group_table = self.db.faddr_table(group_by) 112 | result = group_column + " != 'None' AND " + self.parser.get_sql(self.db) 113 | if len(hashes): 114 | result = result + " AND {}.hash in ('{}')".format(group_table, "', '".join(hashes)) 115 | return result 116 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import unittest 4 | import sqlite3 5 | import polars as pl 6 | 7 | from gbd_core.api import GBD, GBDException 8 | from gbd_core.schema import Schema 9 | 10 | from tests import util 11 | 12 | class APITestCase(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | self.file1 = util.get_random_unique_filename('test1', '.db') 16 | self.file2 = util.get_random_unique_filename('test2', '.db') 17 | sqlite3.connect(self.file1).close() 18 | sqlite3.connect(self.file2).close() 19 | self.name1 = Schema.dbname_from_path(self.file1) 20 | self.name2 = Schema.dbname_from_path(self.file2) 21 | self.api = GBD([self.file1, self.file2]) 22 | return super().setUp() 23 | 24 | def tearDown(self) -> None: 25 | if os.path.exists(self.file1): 26 | os.remove(self.file1) 27 | if os.path.exists(self.file2): 28 | os.remove(self.file2) 29 | return super().tearDown() 30 | 31 | def test_databases_exist(self): 32 | self.assertEquals(self.api.get_databases(), [ self.name1, self.name2 ]) 33 | self.assertEquals(self.api.get_database_path(self.name1), self.file1) 34 | self.assertEquals(self.api.get_database_path(self.name2), self.file2) 35 | 36 | def test_create_feature(self): 37 | self.api.create_feature("A", None, self.name1) 38 | self.assertTrue(self.api.feature_exists("A")) 39 | self.api.create_feature("A", None, self.name2) 40 | api2 = GBD([self.file2]) 41 | self.assertTrue(api2.feature_exists("A")) 42 | with self.assertRaises(GBDException): 43 | self.api.create_feature("A", None, self.name1) 44 | with self.assertRaises(GBDException): 45 | self.api.create_feature("A", None, self.name2) 46 | 47 | def test_delete_feature(self): 48 | self.api.create_feature("A", None, self.name1) 49 | self.api.create_feature("A", None, self.name2) 50 | self.api.delete_feature("A", self.name1) 51 | self.assertFalse(self.api.feature_exists("A", self.name1)) 52 | self.assertTrue(self.api.feature_exists("A")) 53 | self.assertTrue(self.api.feature_exists("A", self.name2)) 54 | self.api.delete_feature("A") 55 | self.assertFalse(self.api.feature_exists("A")) 56 | 57 | def test_rename_feature(self): 58 | self.api.create_feature("A", None, self.name1) 59 | self.api.create_feature("B", None, self.name1) 60 | self.api.create_feature("A", None, self.name2) 61 | self.api.rename_feature("A", "B", self.name2) 62 | self.assertFalse(self.api.feature_exists("A", self.name2)) 63 | self.assertTrue(self.api.feature_exists("B", self.name2)) 64 | self.assertTrue(self.api.feature_exists("A", self.name1)) 65 | self.assertTrue(self.api.feature_exists("B", self.name1)) 66 | with self.assertRaises(GBDException): 67 | 
self.api.rename_feature("A", "B", self.name1) 68 | 69 | def test_set_values(self): 70 | self.api.create_feature("A", None, self.name1) # feature is multi-valued 71 | self.api.create_feature("B", "empty", self.name1) # feature has default value 72 | self.api.create_feature("A", "empty", self.name2) # shadowed feature 73 | # value1 (set values, default values emerge) 74 | self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name1) 75 | df: pl.DataFrame = self.api.query("A = value1", resolve=["A", "B"]) 76 | self.assertCountEqual(df['hash'].to_list(), [ str(i) for i in range(100) ]) 77 | self.assertCountEqual(df['A'].to_list(), [ "value1" for _ in range(100) ]) 78 | self.assertCountEqual(df['B'].to_list(), [ "empty" for _ in range(100) ]) 79 | # value2 (set values, feature is multi-valued) 80 | self.api.set_values("A", "value2", [ str(i) for i in range(50) ], self.name1) 81 | df: pl.DataFrame = self.api.query("A = value1 or A = value2", resolve=["A"], collapse=None) 82 | self.assertCountEqual(df['A'].to_list(), [ "value2" for _ in range(50) ] + [ "value1" for _ in range(100) ]) 83 | # value3 (set values of shadowed feature by specifying target-database) 84 | self.api.set_values("A", "value3", [ str(i) for i in range(50) ], self.name2) 85 | df: pl.DataFrame = self.api.query("A = value1 or A = value2", resolve=["A"], collapse=None) 86 | self.assertCountEqual(df['A'].to_list(), [ "value2" for _ in range(50) ] + [ "value1" for _ in range(100) ]) 87 | self.api.database.commit() 88 | api2 = GBD([self.file2]) 89 | df: pl.DataFrame = api2.query("A = value3", resolve=["A"]) 90 | self.assertCountEqual(df["A"].to_list(), [ "value3" for _ in range(50) ]) 91 | 92 | def test_reset_values(self): 93 | self.api.create_feature("A", None, self.name1) 94 | self.api.create_feature("B", "empty", self.name1) 95 | self.api.create_feature("A", "empty", self.name2) 96 | self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name1) 97 | self.api.set_values("A", "value2", [ str(i) for i in range(100) ], self.name1) 98 | self.api.set_values("B", "value3", [ str(i) for i in range(100) ], self.name1) 99 | self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name2) 100 | # reset values in A 101 | self.api.reset_values("A", [ "value1" ], [ str(i) for i in range(50) ], self.name1) 102 | df: pl.DataFrame = self.api.query(None, hashes=[ str(i) for i in range(100) ], resolve=["A"], collapse=None) 103 | self.assertCountEqual(df['A'].to_list(), [ "value1" for _ in range(50) ] + [ "value2" for _ in range(100) ]) 104 | # reset values in B 105 | self.api.reset_values("B", [ "value3" ], [ str(i) for i in range(50) ], self.name1) 106 | df: pl.DataFrame = self.api.query(None, hashes=[ str(i) for i in range(100) ], resolve=["B"]) 107 | self.assertCountEqual(df['B'].to_list(), [ "value3" for _ in range(50) ] + [ "empty" for _ in range(50) ]) 108 | # reset values in shadowed A 109 | self.api.database.verbose = True 110 | self.api.reset_values("A", [ "value1" ], [ str(i) for i in range(50) ], self.name2) 111 | self.api.database.commit() 112 | api2 = GBD([self.file2]) 113 | df: pl.DataFrame = api2.query("A = value1", resolve=["A"]) 114 | self.assertCountEqual(df["A"].to_list(), [ "value1" for _ in range(50) ]) -------------------------------------------------------------------------------- /gbd_init/instance_transformers.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of 
Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | 16 | import os 17 | import polars as pl 18 | from functools import reduce 19 | 20 | from gbd_core import contexts 21 | from gbd_core.api import GBD, GBDException 22 | from gbd_core import util 23 | 24 | from gbd_core.contexts import identify 25 | from gbd_init.initializer import Initializer, InitializerException 26 | 27 | try: 28 | from gbdc import cnf2kis, sanitise, normalise 29 | except ImportError: 30 | 31 | def cnf2kis(ipath, opath): 32 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 33 | 34 | def sanitise(ipath, opath): 35 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 36 | 37 | def normalise(ipath, opath): 38 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 39 | 40 | 41 | def kis_filename(path): 42 | kispath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path) 43 | return kispath + ".kis" 44 | 45 | 46 | def sanitised_filename(path): 47 | sanpath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path) 48 | return sanpath + ".sanitized.cnf" 49 | 50 | 51 | def normalised_filename(path): 52 | normpath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path) 53 | return normpath + ".normalised.cnf" 54 | 55 | 56 | def wrap_cnf2kis(hash, path, limits): 57 | kispath = kis_filename(path) 58 | util.eprint("Transforming {} to k-ISP {}".format(path, kispath)) 59 | try: 60 | result = cnf2kis(path, kispath) 61 | if "local" in result: 62 | kishash = result["hash"] 63 | return [ 64 | ("local", kishash, result["local"]), 65 | ("to_cnf", kishash, hash), 66 | ("nodes", kishash, result["nodes"]), 67 | ("edges", kishash, result["edges"]), 68 | ("k", kishash, result["k"]), 69 | ] 70 | else: 71 | raise GBDException("CNF2KIS failed for {} due to {}".format(path, result["hash"])) 72 | except Exception as e: 73 | util.eprint(str(e)) 74 | if os.path.exists(kispath): 75 | os.remove(kispath) 76 | 77 | return [] 78 | 79 | 80 | def wrap_sanitise(hash, path, limits): 81 | sanpath = sanitised_filename(path) 82 | util.eprint("Sanitising {}".format(path)) 83 | try: 84 | with open(sanpath, "w") as f, util.stdout_redirected(f): 85 | result = sanitise(path, sanpath) 86 | if "local" in result: 87 | sanhash = result["hash"] 88 | return [("local", sanhash, result["local"]), ("to_cnf", sanhash, hash)] 89 | else: 90 | raise GBDException("Sanitization failed for {}".format(path)) 91 | except Exception as e: 92 | util.eprint(str(e)) 93 | if os.path.exists(sanpath): 94 | os.remove(sanpath) 95 | 96 | return [] 97 | 98 | 99 | def wrap_normalise(hash, path, limits): 100 | normpath = normalised_filename(path) 101 | util.eprint("Normalising {}".format(path)) 102 | try: 103 | with open(normpath, "w") as f, util.stdout_redirected(f): 104 | result = normalise(path, normpath) 105 | 
normhash = result["hash"] 106 | if "local" in result and hash == normhash: 107 | return [("local", normhash, result["local"])] 108 | else: 109 | raise GBDException("Normalisation failed for {}".format(path)) 110 | except Exception as e: 111 | util.eprint(str(e)) 112 | if os.path.exists(normpath): 113 | os.remove(normpath) 114 | 115 | return [] 116 | 117 | 118 | def transform_instances_generic(key: str, api: GBD, rlimits, query, hashes, target_db, source, collapse=None): 119 | einfo = generic_transformers[key] 120 | context = api.database.dcontext(target_db) 121 | if not context in einfo["target"]: 122 | raise InitializerException("Target database context must be in {}".format(einfo["target"])) 123 | if not source in einfo["source"]: 124 | raise InitializerException("Source database context must be in {}".format(einfo["source"])) 125 | transformer = Initializer(api, rlimits, target_db, einfo["features"], einfo["compute"]) 126 | transformer.create_features() 127 | 128 | def path_exists(p): 129 | return p is not None and os.path.exists(einfo["filename"](p)) 130 | 131 | df: pl.DataFrame = api.query(query, hashes, [source + ":local"], collapse=collapse) 132 | missing = df.with_columns( 133 | exists=pl.col("local").map_elements( 134 | path_exists, 135 | return_dtype=pl.Boolean 136 | ) 137 | ).filter(~pl.col("exists")) 138 | 139 | transformer.run(missing) 140 | 141 | 142 | generic_transformers = { 143 | "sanitise": { 144 | "description": "Sanitise CNF files. ", 145 | "source": ["cnf"], 146 | "target": ["sancnf"], 147 | "features": [("local", None), ("to_cnf", None)], 148 | "compute": wrap_sanitise, 149 | "filename": sanitised_filename, 150 | }, 151 | "normalise": { 152 | "description": "Normalise CNF files. ", 153 | "source": ["cnf"], 154 | "target": ["cnf"], 155 | "features": [("local", None)], 156 | "compute": wrap_normalise, 157 | "filename": normalised_filename, 158 | }, 159 | "cnf2kis": { 160 | "description": "Transform CNF files to k-ISP instances. ", 161 | "source": ["cnf"], 162 | "target": ["kis"], 163 | "features": [("local", None), ("to_cnf", None), ("nodes", "empty"), ("edges", "empty"), ("k", "empty")], 164 | "compute": wrap_cnf2kis, 165 | "filename": kis_filename, 166 | }, 167 | } 168 | -------------------------------------------------------------------------------- /gbd_server/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Global Benchmark Database 6 | 7 | 10 | 11 | 12 | 13 |
14 |
15 | GBD-Logo 16 |
17 |
18 |
19 | Select a context 20 | 21 | 22 | {% for c in contexts %} 23 | 30 | {% endfor %} 31 |
32 |
33 |
34 |
35 | Select a context-specific database 36 | 37 | 38 | {% for dbname in databases %} 39 | 46 | {% endfor %} 47 |
48 |
49 |
50 |
51 |
52 | Query for Instances 53 | 54 | 55 | 56 | 57 |
58 |
59 |
60 | {% if error is not none %} 61 | {{ error }} 62 | {% endif %} 63 |
64 |
65 |
66 |
67 |
68 | Quickstart 69 |
    70 |
  • Query for Instances: 71 | Simple queries are constraints of the form "feature operator value", e.g., track=main_2024. More complex queries can be formulated as specified in our SAT 2024 paper. 72 |
  • 73 |
  • Download Instances: 74 | The file {{ query_name }}.uri contains the download links for the selected instances. Use it to download the instances, e.g., with wget like this
    wget --content-disposition -i {{ query_name }}.uri. 75 |
  • 76 |
  • Download the selected database: 77 | {{ selected }} 78 |
  • 79 |
80 |
81 |
82 |
83 | 84 |
85 | 86 | 87 | 88 | {% for feature in features %} 89 | 90 | {% endfor %} 91 | 92 | 93 | {% for row in result %} 94 | 95 | {% for item in row %} 96 | {% if loop.index == 1 %} 97 | 98 | {% elif item is link_field %} 99 | 100 | {% elif item is int_field %} 101 | 102 | {% elif item is num_field %} 103 | 104 | {% else %} 105 | 106 | {% endif %} 107 | {% endfor %} 108 | 109 | {% endfor %} 110 |
hash{{ feature }}
{{ item }}{{ item }}{{ item }}{{ '%0.2f'| format(item|float) }}{{ item }}
111 |
112 | 113 |
114 |
115 | 116 | {% if pages > 0 %} 117 | Found {{ total }} Instances. Select page: 118 | {% for i in range(0, pages) %} 119 | {% if i < 2 or i > pages - 3 or (i > page - 3 and i < page + 3) %} 120 | {% if i == page %} 121 | 122 | {% else %} 123 | 124 | {% endif %} 125 | {% elif i == 3 or i == pages - 3 %} 126 | ... 127 | {% endif %} 128 | {% endfor %} 129 | {% else %} 130 | Found {{ total }} instances. 131 | {% endif %} 132 |
133 |
134 | If you find this useful, please cite Global Benchmark Database (Iser & Jabs, SAT 2024) 135 |
136 |
137 |
138 | 139 | 140 | -------------------------------------------------------------------------------- /gbd_core/grammar.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import tatsu 16 | import json 17 | 18 | from gbd_core.database import Database, DatabaseException 19 | 20 | 21 | class ParserException(Exception): 22 | pass 23 | 24 | 25 | class Parser: 26 | GRAMMAR = r""" 27 | @@grammar::GBDQuery 28 | @@ignorecase::True 29 | 30 | start 31 | = 32 | q:query $ 33 | ; 34 | 35 | query 36 | = 37 | | left:query qop:("and" | "or") ~ right:query 38 | | qop:("not") ~ q:query 39 | | constraint 40 | | "(" q:query ")" 41 | ; 42 | 43 | constraint 44 | = 45 | | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) ter:termstart 46 | | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) num:number 47 | | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) str:string 48 | | col:(dbname ":" column | column) cop:("like" | "unlike") ~ lik:(["%"] string ["%"]) 49 | ; 50 | 51 | termstart 52 | = 53 | ("(") t:term (")") 54 | ; 55 | 56 | term 57 | = 58 | | left:term top:("+" | "-" | "*" | "/") right:term 59 | | ("(") t:term (")") 60 | | constant:number 61 | | col:(dbname ":" column | column) 62 | ; 63 | 64 | string 65 | = 66 | | "'" @:singlequotedstring "'" 67 | | '"' @:doublequotedstring '"' 68 | | /[a-zA-Z0-9_\.\-\/\,\:\+\=\@]+/ 69 | ; 70 | 71 | # number = /[-]?[0-9]+[.]?[0-9]*/ ; 72 | number = /[-]?[0-9]+(?:\.[0-9]+)?(?![A-Za-z0-9_])/ ; 73 | singlequotedstring = /[a-zA-Z0-9_\.\-\/\,\:\+\=\@\s"\*\\]+/ ; 74 | doublequotedstring = /[a-zA-Z0-9_\.\-\/\,\:\+\=\@\s'\*\\]+/ ; 75 | column = /[a-zA-Z][a-zA-Z0-9_]*/ ; 76 | dbname = /[a-zA-Z][a-zA-Z0-9_]*/ ; 77 | """ 78 | 79 | model = tatsu.compile(GRAMMAR) 80 | 81 | def __init__(self, query, verbose=False): 82 | try: 83 | self.ast = Parser.model.parse(query) if query else dict() 84 | if verbose: 85 | print("Parsed: " + query) 86 | print(json.dumps(tatsu.util.asjson(self.ast), indent=2)) 87 | except tatsu.exceptions.FailedParse as e: 88 | raise ParserException(f"Failed to parse query: {str(e)}") from e 89 | 90 | def get_features(self, ast=None): 91 | # import pprint 92 | # pp = pprint.PrettyPrinter(depth=6) 93 | # pp.pprint(ast) 94 | try: 95 | ast = ast if ast else self.ast 96 | if "q" in ast: 97 | return self.get_features(ast["q"]) 98 | elif "t" in ast: 99 | return self.get_features(ast["t"]) 100 | elif "qop" in ast or "top" in ast: 101 | return self.get_features(ast["left"]) | self.get_features(ast["right"]) 102 | elif "cop" in ast and "ter" in ast: 103 | return {"".join(ast["col"])} | self.get_features(ast["ter"]) 104 | elif "col" in ast: 105 | return {"".join(ast["col"])} 106 | else: 107 | return set() 108 | except TypeError as e: 109 | raise 
ParserException(f"Failed to parse query: {str(e)}") from e 110 | 111 | def get_sql(self, db: Database, ast=None): 112 | try: 113 | ast = ast if ast else self.ast 114 | if "qop" in ast and ast["qop"] == "not": 115 | return "NOT (" + self.get_sql(db, ast["q"]) + ")" 116 | if "q" in ast: 117 | return "(" + self.get_sql(db, ast["q"]) + ")" 118 | if "t" in ast: 119 | return "(" + self.get_sql(db, ast["t"]) + ")" 120 | if "qop" in ast or "top" in ast: # query operator or term operator 121 | operator = ast["qop"] if ast["qop"] else ast["top"] 122 | left = self.get_sql(db, ast["left"]) 123 | right = self.get_sql(db, ast["right"]) 124 | return f"{left} {operator} {right}" 125 | if "cop" in ast: # constraint operator 126 | operator = "not like" if ast["cop"] == "unlike" else ast["cop"] 127 | feat = db.faddr("".join(ast["col"])) 128 | feat_is_1_n = db.find("".join(ast["col"])).default is None 129 | if "str" in ast: # cop:("=" | "!=") 130 | if feat_is_1_n: 131 | table = db.faddr_table("".join(ast["col"])) 132 | setop = "IN" if ast["cop"] == "=" else "NOT IN" 133 | return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE {f} = '{s}')".format(o=setop, t=table, f=feat, s=ast["str"]) 134 | return f"{feat} {operator} '{ast['str']}'" 135 | if "num" in ast: # cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) 136 | if feat_is_1_n: 137 | table = db.faddr_table("".join(ast["col"])) 138 | return "{t}.hash IN (SELECT {t}.hash FROM {t} WHERE CAST({f} AS FLOAT) {o} {s})".format(o=operator, t=table, f=feat, s=ast["num"]) 139 | return f"CAST({feat} AS FLOAT) {operator} {ast['num']}" 140 | if "lik" in ast: # cop:("like" | "unlike") 141 | if feat_is_1_n: 142 | table = db.faddr_table("".join(ast["col"])) 143 | setop = "IN" if ast["cop"] == "like" else "NOT IN" 144 | return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE {f} like '{s}')".format( 145 | o=setop, t=table, f=feat, s="".join([t for t in ast["lik"] if t]) 146 | ) 147 | return f"{feat} {operator} '{''.join([t for t in ast['lik'] if t])}'" 148 | if "ter" in ast: # cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) 149 | if feat_is_1_n and ast["cop"] == "!=": 150 | table = db.faddr_table("".join(ast["col"])) 151 | setop = "NOT IN" if ast["cop"] == "!=" else "IN" 152 | cop = "=" if ast["cop"] == "!=" else ast["cop"] 153 | return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE CAST({f} AS FLOAT) {c} {s})".format( 154 | o=setop, c=cop, t=table, f=feat, s=self.get_sql(db, ast["ter"]) 155 | ) 156 | return f"CAST({feat} AS FLOAT) {operator} {self.get_sql(db, ast['ter'])}" 157 | raise ParserException("Missing right-hand side of constraint") 158 | if "col" in ast: 159 | feature = db.faddr("".join(ast["col"])) 160 | return f"CAST({feature} AS FLOAT)" 161 | if "constant" in ast: 162 | return ast["constant"] 163 | return "1=1" 164 | except TypeError as e: 165 | raise ParserException(f"Failed to parse query: {str(e)}") from e 166 | except DatabaseException as e: 167 | raise ParserException(f"Failed to parse query: {str(e)}") from e 168 | -------------------------------------------------------------------------------- /gbd_init/feature_extractors.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import os 16 | import glob 17 | import warnings 18 | import polars as pl 19 | 20 | from gbd_core.contexts import suffixes, identify, get_context_by_suffix 21 | from gbd_core.api import GBD, GBDException 22 | from gbd_core.util import eprint, confirm 23 | from gbd_init.initializer import Initializer, InitializerException 24 | 25 | try: 26 | from gbdc import ( 27 | extract_base_features, 28 | base_feature_names, 29 | extract_gate_features, 30 | gate_feature_names, 31 | isohash, 32 | wcnfisohash, 33 | wcnf_base_feature_names, 34 | extract_wcnf_base_features, 35 | opb_base_feature_names, 36 | extract_opb_base_features, 37 | checksani, 38 | checksani_feature_names, 39 | ) 40 | except ImportError: 41 | 42 | def extract_base_features(path, tlim, mlim): 43 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 44 | 45 | def base_feature_names(): 46 | return [] 47 | 48 | def extract_gate_features(path, tlim, mlim): 49 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 50 | 51 | def gate_feature_names(): 52 | return [] 53 | 54 | def isohash(path): 55 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 56 | 57 | def extract_wcnf_base_features(path, tlim, mlim): 58 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 59 | 60 | def wcnf_base_feature_names(): 61 | return [] 62 | 63 | def extract_opb_base_features(path, tlim, mlim): 64 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 65 | 66 | def opb_base_feature_names(): 67 | return [] 68 | 69 | def checksani(path, tlim, mlim): 70 | raise ModuleNotFoundError("gbdc not found", name="gbdc") 71 | 72 | def checksani_feature_names(): 73 | return [] 74 | 75 | 76 | ## GBDHash 77 | def compute_hash(hash, path, limits): 78 | eprint("Hashing {}".format(path)) 79 | hash = identify(path) 80 | return [("local", hash, path), ("filename", hash, os.path.basename(path))] 81 | 82 | 83 | ## ISOHash 84 | def compute_isohash(hash, path, limits): 85 | eprint("Computing ISOHash for {}".format(path)) 86 | context = get_context_by_suffix(path) 87 | if context == "wcnf": 88 | ihash = wcnfisohash(path) 89 | else: 90 | ihash = isohash(path) 91 | return [("isohash", hash, ihash)] 92 | 93 | 94 | ## Base Features 95 | def compute_base_features(hash, path, limits, tp=None): 96 | eprint("Extracting base features from {} {}".format(hash, path)) 97 | rec = extract_base_features(path, limits["tlim"], limits["mlim"]) 98 | return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()] 99 | 100 | 101 | ## Gate Features 102 | def compute_gate_features(hash, path, limits, tp=None): 103 | eprint("Extracting gate features from {} {}".format(hash, path)) 104 | rec = extract_gate_features(path, limits["tlim"], limits["mlim"]) 105 | return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()] 106 | 107 | 108 | ## WCNF Base Features 109 | def compute_wcnf_base_features(hash, path, limits, tp=None): 110 | eprint("Extracting WCNF base features from {} {}".format(hash, path)) 111 | rec = extract_wcnf_base_features(path, limits["tlim"], limits["mlim"]) 112 | return [(key, hash, int(value) if 
isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()] 113 | 114 | 115 | ## OPB Base Features 116 | def compute_opb_base_features(hash, path, limits, tp=None): 117 | eprint("Extracting OPB base features from {} {}".format(hash, path)) 118 | rec = extract_opb_base_features(path, limits["tlim"], limits["mlim"]) 119 | return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()] 120 | 121 | 122 | ## SANI Features 123 | def compute_sani_features(hash, path, limits, tp=None): 124 | eprint("Extracting SANI features from {} {}".format(hash, path)) 125 | rec = checksani(path, limits["tlim"], limits["mlim"]) 126 | return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()] 127 | 128 | 129 | generic_extractors = { 130 | "base": { 131 | "description": "Extract base features from CNF files. ", 132 | "contexts": ["cnf", "sancnf"], 133 | "features": [(name, "empty") for name in base_feature_names()], 134 | "compute": compute_base_features, 135 | }, 136 | "checksani": { 137 | "description": "Extract sanitise status from CNF files. ", 138 | "contexts": ["cnf", "sancnf"], 139 | "features": [(name, "empty") for name in checksani_feature_names()], 140 | "compute": compute_sani_features, 141 | }, 142 | "gate": { 143 | "description": "Extract gate features from CNF files. ", 144 | "contexts": ["cnf", "sancnf"], 145 | "features": [(name, "empty") for name in gate_feature_names()], 146 | "compute": compute_gate_features, 147 | }, 148 | "isohash": { 149 | "description": "Compute ISOHash for CNF or WCNF files. ", 150 | "contexts": ["cnf", "wcnf", "sancnf"], 151 | "features": [("isohash", "empty")], 152 | "compute": compute_isohash, 153 | }, 154 | "wcnfbase": { 155 | "description": "Extract base features from WCNF files. ", 156 | "contexts": ["wcnf"], 157 | "features": [(name, "empty") for name in wcnf_base_feature_names()], 158 | "compute": compute_wcnf_base_features, 159 | }, 160 | "opbbase": { 161 | "description": "Extract base features from OPB files. 
", 162 | "contexts": ["opb"], 163 | "features": [(name, "empty") for name in opb_base_feature_names()], 164 | "compute": compute_opb_base_features, 165 | }, 166 | } 167 | 168 | 169 | def init_features_generic(key: str, api: GBD, rlimits, df: pl.DataFrame, target_db): 170 | einfo = generic_extractors[key] 171 | context = api.database.dcontext(target_db) 172 | if not context in einfo["contexts"]: 173 | raise InitializerException("Target database context must be in {}".format(einfo["contexts"])) 174 | extractor = Initializer(api, rlimits, target_db, einfo["features"], einfo["compute"]) 175 | extractor.create_features() 176 | extractor.run(df) 177 | 178 | 179 | def init_local(api: GBD, rlimits, root, target_db): 180 | context = api.database.dcontext(target_db) 181 | 182 | features = [("local", None), ("filename", None)] 183 | extractor = Initializer(api, rlimits, target_db, features, compute_hash) 184 | extractor.create_features() 185 | 186 | # Cleanup stale entries 187 | df: pl.DataFrame = api.query(group_by=context + ":local", collapse=None) 188 | 189 | def path_exists(p): 190 | return p is not None and os.path.exists(p) 191 | 192 | missing = df.with_columns( 193 | exists=pl.col("local").map_elements( 194 | path_exists, 195 | return_dtype=pl.Boolean 196 | ) 197 | ).filter(~pl.col("exists")).select("local") 198 | 199 | if len(missing) and api.verbose: 200 | for path in missing["local"].to_list(): 201 | eprint(path) 202 | if len(missing) and confirm("{} files not found. Remove stale entries from local table?".format(len(missing))): 203 | api.reset_values("local", values=missing["local"].to_list()) 204 | 205 | # Create df with paths not yet in local table 206 | paths = [path for suffix in suffixes(context) for path in glob.iglob(root + "/**/*" + suffix, recursive=True)] 207 | df2 = pl.DataFrame([(None, path) for path in paths if path not in df["local"].to_list()], schema=["hash", "local"], orient="row") 208 | 209 | extractor.run(df2) 210 | -------------------------------------------------------------------------------- /gbd_server/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # MIT License 4 | 5 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 6 | 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 
16 | 17 | from logging.handlers import TimedRotatingFileHandler 18 | import os 19 | import re 20 | import polars as pl 21 | 22 | import flask 23 | import logging 24 | import waitress 25 | from werkzeug.middleware.proxy_fix import ProxyFix 26 | 27 | from gbd_core.database import DatabaseException 28 | from gbd_core.api import GBD, GBDException 29 | from gbd_core.grammar import ParserException 30 | from gbd_core.util import is_number 31 | from gbd_core import contexts 32 | 33 | app = flask.Flask(__name__) 34 | 35 | 36 | def request_query(request): 37 | query = "" 38 | if "query" in request.values: 39 | query = request.values.get("query") 40 | elif len(request.args) > 0: 41 | query = " and ".join(["{}={}".format(key, value) for (key, value) in request.args.items() if key != "context"]) 42 | return query 43 | 44 | 45 | def request_database(request): 46 | if "selected_db" in request.values and request.values.get("selected_db") in app.config["dbnames"]: 47 | dbname = request.values.get("selected_db") 48 | context = request_context(request) 49 | if dbname in [GBD.get_database_name(c) for c in app.config["contextdbs"][context]]: 50 | return dbname 51 | else: 52 | return GBD.get_database_name(app.config["contextdbs"][context][0]) 53 | else: 54 | return app.config["dbnames"][0] 55 | 56 | 57 | def request_page(request): 58 | return int(request.values.get("page")) if "page" in request.values else 0 59 | 60 | 61 | def request_action(request): 62 | return request.values.get("action") if "action" in request.values else "default" 63 | 64 | 65 | def request_context(request): 66 | return request.values.get("context") if "context" in request.values else contexts.default_context() 67 | 68 | 69 | def query_to_name(query): 70 | return re.sub(r"[^\w]", "_", query) if query else "allinstances" 71 | 72 | 73 | def error_response(msg, addr, errno=404): 74 | app.logger.error("{}: {}".format(addr, msg)) 75 | return flask.Response(msg, status=errno, mimetype="text/plain") 76 | 77 | 78 | def file_response(text_blob, filename, mimetype, addr): 79 | app.logger.info("{}: Sending generated file {}".format(addr, filename)) 80 | return flask.Response(text_blob, mimetype=mimetype, headers={"Content-Disposition": 'attachment; filename="{}"'.format(filename), "filename": filename}) 81 | 82 | 83 | def path_response(path, filename, mimetype, addr): 84 | app.logger.info("{}: Sending file {}".format(addr, path)) 85 | return flask.send_file(path, as_attachment=True, download_name=filename, mimetype=mimetype) 86 | 87 | 88 | def json_response(json_blob, msg, addr): 89 | app.logger.info("{}: {}".format(addr, msg)) 90 | return flask.Response(json_blob, status=200, mimetype="application/json") 91 | 92 | 93 | def page_response(context, query, database, page=0): 94 | with GBD(app.config["contextdbs"][context]) as gbd: 95 | start = page * 1000 96 | end = start + 1000 97 | error = None 98 | try: 99 | df: pl.DataFrame = gbd.query(query, resolve=["{}:{}".format(database, f) for f in app.config["features"][database]], collapse="GROUP_CONCAT") 100 | except GBDException as err: 101 | error = "GBDException: {}".format(str(err)) 102 | except DatabaseException as err: 103 | error = "DatabaseException: {}".format(str(err)) 104 | except ParserException as err: 105 | error = "ParserException: {}".format(str(err)) 106 | except Exception as err: 107 | error = "An Unhandled Exception Occurred" 108 | return flask.render_template( 109 | "index.html", 110 | context=context, 111 | error=error, 112 | contexts=app.config["contexts"], 113 | query=query, 114 | 
query_name=query_to_name(query), 115 | # result=df.iloc[start:end, :].values.tolist() if error is None else [], 116 | result=( 117 | [list(r) for r in df.slice(start, end - start).rows()] 118 | if error is None 119 | else [] 120 | ), 121 | total=len(df) if error is None else 0, 122 | page=page, 123 | pages=int(len(df) / 1000) + 1 if error is None else 0, 124 | selected=database, 125 | features=app.config["features"][database], 126 | databases=[gbd.get_database_name(db) for db in app.config["contextdbs"][context]], 127 | action=request_action(flask.request), 128 | ) 129 | 130 | 131 | # Returns main index page 132 | @app.route("/", methods=["POST", "GET"]) 133 | def quick_search(): 134 | context = request_context(flask.request) 135 | query = request_query(flask.request) 136 | database = request_database(flask.request) 137 | context_databases = [GBD.get_database_name(db) for db in app.config["contextdbs"][context]] 138 | if not database in context_databases: 139 | database = context_databases[0] 140 | page = request_page(flask.request) 141 | return page_response(context, query, database, page) 142 | 143 | 144 | # Generates a list of URLs. Given query (text field of POST form) is executed and the hashes of the result are resolved 145 | # against the filename feature. Every filename is associated with a URL to enable flexible downloading of these files 146 | @app.route("/getinstances/", methods=["POST", "GET"]) 147 | @app.route("/getinstances", methods=["POST", "GET"]) 148 | def get_url_file(): 149 | context = request_context(flask.request) 150 | with GBD(app.config["contextdbs"][context]) as gbd: 151 | query = request_query(flask.request) 152 | try: 153 | df: pl.DataFrame = gbd.query(query) 154 | except (GBDException, DatabaseException, ParserException) as err: 155 | return error_response("{}, {}".format(type(err), str(err)), flask.request.remote_addr, errno=500) 156 | if context == "cnf": 157 | content = "\n".join([flask.url_for("get_file", hashvalue=val, _external=True) for val in df["hash"].to_list()]) 158 | else: 159 | content = "\n".join([flask.url_for("get_file", hashvalue=val, context=context, _external=True) for val in df["hash"].to_list()]) 160 | return file_response(content, query_to_name(query) + ".uri", "text/uri-list", flask.request.remote_addr) 161 | 162 | 163 | # Send database file 164 | @app.route("/getdatabase/") 165 | @app.route("/getdatabase") 166 | @app.route("/getdatabase//") 167 | @app.route("/getdatabase/") 168 | def get_database_file(database=None): 169 | dbname = database if database and database in app.config["dbnames"] else app.config["dbnames"][0] 170 | dbpath = app.config["dbpaths"][dbname] 171 | return path_response(dbpath, os.path.basename(dbpath), "application/x-sqlite3", flask.request.remote_addr) 172 | 173 | 174 | # Find the file corresponding to the hashvalue and send it to the client 175 | @app.route("/file//") 176 | @app.route("/file/") 177 | def get_file(hashvalue): 178 | context = request_context(flask.request) 179 | print(context, app.config["contextdbs"][context]) 180 | with GBD(app.config["contextdbs"][context]) as gbd: 181 | df: pl.DataFrame = gbd.query(hashes=[hashvalue], resolve=["local", "filename"], collapse="MIN") 182 | if not len(df): 183 | return error_response("Hash '{}' not found".format(hashvalue), flask.request.remote_addr) 184 | row = df.to_dicts()[0] 185 | if not os.path.exists(row["local"]): 186 | return error_response("Files temporarily not accessible", flask.request.remote_addr) 187 | return path_response(row["local"], row["hash"] + 
"-" + row["filename"], "application/x-xz", flask.request.remote_addr) 188 | 189 | 190 | # start the server 191 | def serve(gbd: GBD, port: int = 5000, logdir: str = "/tmp"): 192 | formatter = logging.Formatter( 193 | fmt="[%(asctime)s, %(name)s, %(levelname)s] %(module)s.%(filename)s.%(funcName)s():%(lineno)d\n%(message)s", datefmt="%Y-%m-%d %H:%M:%S" 194 | ) 195 | logging.getLogger().setLevel(logging.DEBUG) 196 | # Add sys.stdout to logging output 197 | console_handler = logging.StreamHandler() 198 | console_handler.setFormatter(formatter) 199 | console_handler.setLevel(logging.INFO) 200 | logging.getLogger().addHandler(console_handler) 201 | # Add handler to write in rotating logging files 202 | file_handler = TimedRotatingFileHandler(logdir + "/trfile.log", when="midnight", backupCount=10) 203 | file_handler.setFormatter(formatter) 204 | file_handler.setLevel(logging.WARNING) 205 | logging.getLogger().addHandler(file_handler) 206 | 207 | global app 208 | app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1) 209 | 210 | app.jinja_env.trim_blocks = True 211 | app.jinja_env.lstrip_blocks = True 212 | 213 | app.jinja_env.tests["link_field"] = lambda field: field is not None and field.startswith("http") 214 | app.jinja_env.tests["num_field"] = lambda field: field is not None and is_number(field) 215 | app.jinja_env.tests["int_field"] = lambda field: field is not None and field.isnumeric() 216 | 217 | path = os.path.dirname(__file__) 218 | app.static_folder = os.path.join(path, "static") 219 | app.template_folder = os.path.join(path, "templates") 220 | 221 | app.config["contexts"] = gbd.get_contexts() 222 | app.config["dbnames"] = gbd.get_databases() 223 | # group databases by context 224 | app.config["contextdbs"] = dict() 225 | for ctxt in app.config["contexts"]: 226 | app.config["contextdbs"][ctxt] = [gbd.get_database_path(c) for c in gbd.get_databases(ctxt)] 227 | # group features by database 228 | app.config["dbpaths"] = dict() 229 | app.config["features"] = dict() 230 | for db in app.config["dbnames"]: 231 | app.config["features"][db] = [f for f in gbd.get_features(db) if not f in ["hash", "local"]] 232 | app.config["dbpaths"][db] = gbd.get_database_path(db) 233 | app.config["features_flat"] = [f for f in gbd.get_features() if not f in ["hash", "local"]] 234 | 235 | waitress.serve(app, host="0.0.0.0", port=port) 236 | -------------------------------------------------------------------------------- /gbd_core/api.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | 15 | 16 | import sqlite3 17 | import tatsu 18 | import polars as pl 19 | 20 | from contextlib import ExitStack 21 | import traceback 22 | 23 | from gbd_core.query import GBDQuery 24 | from gbd_core.database import Database 25 | from gbd_core.database import Schema 26 | from gbd_core import util 27 | 28 | 29 | class GBDException(Exception): 30 | pass 31 | 32 | 33 | class GBD: 34 | # Create a new GBD object which operates on the given databases 35 | def __init__(self, dbs: list, verbose: bool = False): 36 | assert isinstance(dbs, list) 37 | self.database = Database(dbs, verbose) 38 | self.verbose = verbose 39 | 40 | def __enter__(self): 41 | with ExitStack() as stack: 42 | stack.enter_context(self.database) 43 | self._stack = stack.pop_all() 44 | return self 45 | 46 | def __exit__(self, exc_type, exc, traceback): 47 | self._stack.__exit__(exc_type, exc, traceback) 48 | 49 | @classmethod 50 | def identify(cls, path): 51 | """Identify the given benchmark by its GBD hash 52 | 53 | Args: 54 | path (str): path to benchmark 55 | 56 | Returns: 57 | str: GBD hash 58 | """ 59 | from gbd_core.contexts import identify 60 | 61 | return identify(path) 62 | 63 | def query(self, gbd_query=None, hashes=[], resolve=[], collapse="group_concat", group_by=None, join_type="LEFT") -> pl.DataFrame: 64 | """Query the database 65 | 66 | Args: 67 | gbd_query (str): GBD query string 68 | hashes (list): list of hashes (=benchmark ids), the query is restricted to 69 | resolve (list): list of features to be resolved 70 | collapse (str): collapse function: min, max, avg, count, sum, group_concat, or none 71 | group_by (str): group results by that feature instead of hash (default) 72 | join_type (str): join type: left or inner 73 | 74 | Returns: 75 | polars.DataFrame: query result 76 | """ 77 | query_builder = GBDQuery(self.database, gbd_query) 78 | try: 79 | sql = query_builder.build_query(hashes, resolve, group_by, join_type, collapse) 80 | except tatsu.exceptions.FailedParse as err: 81 | if self.verbose: 82 | util.eprint(traceback.format_exc()) 83 | raise GBDException("Parser Error with Query '{}': {}".format(gbd_query, str(err))) 84 | try: 85 | result = self.database.query(sql) 86 | except sqlite3.OperationalError as err: 87 | if self.verbose: 88 | util.eprint(traceback.format_exc()) 89 | raise GBDException("Database Operational Error: {}".format(str(err))) 90 | group = group_by or query_builder.determine_group_by(resolve) 91 | cols = [p.split(":") for p in [group] + resolve] 92 | cols = [c[0] if len(c) == 1 else c[1] for c in cols] 93 | return pl.DataFrame(result, schema=cols, orient="row") 94 | 95 | def set_values(self, name, value, hashes, target_db=None): 96 | """Set feature value for given hashes 97 | 98 | Args: 99 | name (str): feature name 100 | value (str): value to be set 101 | hashes (list): list of hashes (=benchmark ids) 102 | target_db (str, optional): name of target database 103 | if None, default database (first in list) is used 104 | Raises: 105 | GBDException, if feature does not exist 106 | """ 107 | if not self.feature_exists(name, target_db): 108 | raise GBDException("Feature '{}' does not exist".format(name)) 109 | if not len(hashes): 110 | raise GBDException("No hashes given") 111 | self.database.set_values(name, value, hashes, target_db) 112 | 113 | def reset_values(self, feature, values=[], hashes=[], target_db=None): 114 | """Reset feature value for given hashes 115 | 116 | Args: 117 | feature (str): feature name 118 | values (list, optional): list of values to be reset 119 | hashes (list, 
optional): list of hashes (=benchmark ids) to be reset 120 | target_db (str, optional): name of target database 121 | if None, default database (first in list) is used 122 | 123 | Raises: 124 | GBDException, if feature does not exist 125 | """ 126 | if not self.feature_exists(feature, target_db): 127 | raise GBDException("Feature '{}' does not exist".format(feature)) 128 | if len(values) and len(hashes): 129 | for values_slice in util.slice_iterator(values, 10): 130 | for hashes_slice in util.slice_iterator(hashes, 10): 131 | self.database.delete(feature, values_slice, hashes_slice, target_db) 132 | self.database.commit() 133 | elif len(values): 134 | for values_slice in util.slice_iterator(values, 10): 135 | self.database.delete(feature, values_slice, [], target_db) 136 | self.database.commit() 137 | elif len(hashes): 138 | for hashes_slice in util.slice_iterator(hashes, 10): 139 | self.database.delete(feature, [], hashes_slice, target_db) 140 | self.database.commit() 141 | 142 | def delete_hashes(self, hashes, target_db=None): 143 | """Delete all values for given hashes 144 | 145 | Args: 146 | hashes (list): list of hashes (=benchmark ids) to be deleted 147 | target_db (str, optional): name of target database 148 | if None, default database (first in list) is used 149 | 150 | Raises: 151 | GBDException, if feature does not exist 152 | """ 153 | if not len(hashes): 154 | raise GBDException("No hashes given") 155 | self.database.delete_hashes_entirely(hashes, target_db) 156 | 157 | def get_databases(self, context=None): 158 | """Get list of database names 159 | 160 | Returns: list of database names 161 | """ 162 | if context is None: 163 | return list(self.database.get_databases()) 164 | else: 165 | return [db for db in self.database.get_databases() if self.database.dcontext(db) == context] 166 | 167 | def get_database_path(self, dbname): 168 | """Get path for given database name 169 | 170 | Args: 171 | dbname (str): name of database 172 | 173 | Returns: path to database 174 | """ 175 | return self.database.dpath(dbname) 176 | 177 | @classmethod 178 | def get_database_name(self, path): 179 | """Get database name for given path 180 | 181 | Args: 182 | path (str): path to database 183 | 184 | Returns: name of database 185 | """ 186 | return Schema.dbname_from_path(path) 187 | 188 | def get_contexts(self, dbs=[]): 189 | """Get list of contexts 190 | 191 | Returns: list of contexts 192 | """ 193 | if not len(dbs): 194 | return list(self.database.get_contexts()) 195 | else: 196 | return list(set([self.database.dcontext(db) for db in dbs])) 197 | 198 | def get_feature_info(self, fname): 199 | """Retrieve information about a specific feature""" 200 | finfo = self.database.find(fname) 201 | df: pl.DataFrame = self.query(resolve=[fname], collapse=None) 202 | 203 | min_value = sorted(pl.Series(df[fname]).to_list())[0] 204 | max_value = sorted(pl.Series(df[fname]).to_list(), reverse=True)[0] 205 | return { 206 | "feature": fname, 207 | "count": len(df), 208 | "default": finfo.default, 209 | "num-min": min_value, 210 | "num-max": max_value, 211 | "strings": " ".join(sorted([val for val in df[fname].unique() if val and not util.is_number(val)])), 212 | } 213 | 214 | def get_features(self, dbname: str = None): 215 | """Get features from the database. 
216 | 217 | Args: 218 | dbname (str): name of feature database 219 | if None, feature list is accumulated over all databases 220 | 221 | Returns: list of features names 222 | """ 223 | lst = self.database.get_features([] if not dbname else [dbname]) 224 | while "hash" in lst: 225 | lst.remove("hash") 226 | return lst 227 | 228 | def feature_exists(self, name, dbname=None): 229 | """Check if feature exists in the database. 230 | 231 | Args: 232 | name (str): name of feature 233 | dbname (str): name of feature database 234 | if None, feature existence is checked for in all databases 235 | 236 | Returns: True if feature exists in dbname or any database, False otherwise 237 | """ 238 | return name in self.get_features(dbname) 239 | 240 | def create_feature(self, name: str, default_value: str = None, target_db: str = None): 241 | """Creates feature with given name 242 | 243 | Args: 244 | name (str): feature name 245 | default_value (str): default value for 1:1 features 246 | if None, a multi-valued (1:n) feature is created 247 | target_db (str): database name 248 | if None, default database (fist in list) is used 249 | 250 | Returns: None 251 | 252 | Raises: 253 | GBDException, if feature already exists in target_db 254 | """ 255 | if not self.feature_exists(name, target_db): 256 | self.database.create_feature(name, default_value, target_db, False) 257 | else: 258 | raise GBDException("Feature '{}' does already exist".format(name)) 259 | 260 | def delete_feature(self, name, target_db=None): 261 | """Deletes feature with given name 262 | 263 | Args: 264 | name (str): feature name 265 | target_db (str): database name 266 | if None, default database (fist in list) is used 267 | 268 | Returns: None 269 | 270 | Raises: 271 | GBDException, if feature does not exist in target_db 272 | """ 273 | if self.feature_exists(name, target_db): 274 | self.database.delete_feature(name, target_db) 275 | else: 276 | raise GBDException("Feature '{}' does not exist".format(name)) 277 | 278 | def rename_feature(self, old_name, new_name, target_db=None): 279 | """Renames feature with given name 280 | 281 | Args: 282 | old_name (str): old feature name 283 | new_name (str): new feature name 284 | target_db (str): database name 285 | if None, default database (fist in list) is used 286 | 287 | Returns: None 288 | 289 | Raises: 290 | GBDException, 291 | - if feature 'old_name' does not exist in target_db 292 | - if feature 'new_name' already exists in target_db 293 | """ 294 | if not self.feature_exists(old_name, target_db): 295 | raise GBDException("Feature '{}' does not exist".format(old_name)) 296 | elif self.feature_exists(new_name, target_db): 297 | raise GBDException("Feature '{}' does already exist".format(new_name)) 298 | else: 299 | self.database.rename_feature(old_name, new_name, target_db) 300 | 301 | def copy_feature(self, old_name, new_name, target_db=None, gbd_query=None, hashes=[]): 302 | """Copies feature with given name 303 | 304 | Args: 305 | old_name (str): old feature name 306 | new_name (str): new feature name 307 | target_db (str): name of database to copy feature to 308 | if None, default database (fist in list) is used 309 | 310 | Returns: None 311 | """ 312 | if not self.feature_exists(old_name): 313 | raise GBDException("Feature '{}' does not exist".format(old_name)) 314 | 315 | if not self.feature_exists(new_name, target_db): 316 | self.create_feature(new_name, target_db=target_db) 317 | 318 | hashes = self.query(gbd_query=gbd_query, hashes=hashes)["hash"].to_list() 319 | 320 | 
self.database.copy_feature(old_name, new_name, target_db, hashes) 321 | -------------------------------------------------------------------------------- /gbd_core/database.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import sqlite3 16 | import typing 17 | 18 | from pprint import pprint 19 | 20 | from gbd_core.util import eprint 21 | from gbd_core.schema import Schema, FeatureInfo 22 | from gbd_core import contexts 23 | 24 | 25 | class DatabaseException(Exception): 26 | pass 27 | 28 | 29 | class Database: 30 | def __init__(self, path_list: list, verbose=False, autocommit=True): 31 | self.verbose = verbose 32 | self.schemas = self.init_schemas(path_list) 33 | self.features = self.init_features() 34 | self.connection = sqlite3.connect("file::memory:?cache=shared", uri=True, timeout=10) 35 | self.cursor = self.connection.cursor() 36 | self.maindb = None 37 | self.autocommit = autocommit 38 | schema: Schema 39 | for schema in self.schemas.values(): 40 | if not schema.is_in_memory(): 41 | self.execute("ATTACH DATABASE '{}' AS {}".format(schema.path, schema.dbname)) 42 | else: 43 | self.execute("ATTACH DATABASE 'file:{}?mode=memory&cache=shared' AS {}".format(schema.dbname, schema.dbname)) 44 | # first database is the default database: 45 | if not self.maindb: 46 | self.maindb = schema.dbname 47 | 48 | def __enter__(self): 49 | return self 50 | 51 | def __exit__(self, exception_type, exception_value, traceback): 52 | self.connection.commit() 53 | self.connection.close() 54 | 55 | # returns major version of sqlite3 as float 56 | @classmethod 57 | def sqlite3_version(cls): 58 | return float(sqlite3.sqlite_version.rsplit(".", 1)[0]) 59 | 60 | def init_schemas(self, path_list) -> typing.Dict[str, Schema]: 61 | result = dict() 62 | for path in path_list: 63 | schema = Schema.create(path) 64 | if not schema.dbname in result: 65 | result[schema.dbname] = schema 66 | elif schema.is_in_memory(): 67 | result[schema.dbname].absorb(schema) 68 | else: 69 | raise DatabaseException("Database name collision on " + schema.dbname) 70 | return result 71 | 72 | # return a dictionary which maps feature names to feature infos 73 | def init_features(self) -> typing.Dict[str, FeatureInfo]: 74 | result = dict() 75 | schema: Schema 76 | for schema in self.schemas.values(): 77 | feature: FeatureInfo 78 | for feature in schema.features.values(): 79 | # first found feature is used: (=feature precedence by database position) 80 | if not feature.name in result: 81 | result[feature.name] = [feature] 82 | elif feature.column == "hash" and feature.table == "features": 83 | # first found features table is the one that serves the hash 84 | if result[feature.name][0].table != "features": 85 | result[feature.name].insert(0, feature) 86 | else: 87 | 
result[feature.name].append(feature) 88 | else: 89 | result[feature.name].append(feature) 90 | return result 91 | 92 | def query(self, q): 93 | if self.verbose: 94 | eprint(q) 95 | return self.cursor.execute(q).fetchall() 96 | 97 | def execute(self, q): 98 | if self.verbose: 99 | eprint(q) 100 | self.cursor.execute(q) 101 | if self.autocommit: 102 | self.commit() 103 | 104 | def commit(self): 105 | self.connection.commit() 106 | 107 | def set_auto_commit(self, autocommit): 108 | self.autocommit = autocommit 109 | 110 | def dexists(self, dbname): 111 | return dbname in self.schemas.keys() 112 | 113 | def dmain(self, dbname): 114 | return dbname == self.maindb 115 | 116 | def dpath(self, dbname): 117 | if not dbname in self.schemas: 118 | raise DatabaseException("Database '{}' not found".format(dbname)) 119 | return self.schemas[dbname].path 120 | 121 | def dcontext(self, dbname): 122 | if not dbname in self.schemas: 123 | raise DatabaseException("Database '{}' not found".format(dbname)) 124 | return self.schemas[dbname].context 125 | 126 | def dtables(self, dbname): 127 | if not dbname in self.schemas: 128 | raise DatabaseException("Database '{}' not found".format(dbname)) 129 | return self.schemas[dbname].get_tables() 130 | 131 | def finfo(self, fname, db=None): 132 | if fname in self.features and len(self.features[fname]) > 0: 133 | if db is None: 134 | return self.features[fname][0] 135 | else: 136 | infos = [info for info in self.features[fname] if info.database == db] 137 | if len(infos) == 0: 138 | raise DatabaseException("Feature '{}' does not exists in database {}".format(fname, db)) 139 | return infos[0] 140 | else: 141 | raise DatabaseException("Feature '{}' does not exists".format(fname)) 142 | 143 | def faddr_column(self, feature): 144 | finfo = self.find(feature) 145 | return "{}.{}.{}".format(finfo.database, finfo.table, finfo.column) 146 | 147 | def faddr_table(self, feature): 148 | finfo = self.find(feature) 149 | return "{}.{}".format(finfo.database, finfo.table) 150 | 151 | def find(self, fid: str, db: str = None): 152 | """Find feature by name or feature identifier 153 | 154 | Args: 155 | fid: feature identifier, of the form "database:feature", "context:feature" or "feature" 156 | db: database name (optional), if given fid is unique without database: or context: prefix 157 | 158 | Returns: 159 | FeatureInfo object: the info object for the first found feature 160 | feature precedence is according to the order of databases in the path list 161 | ambiguity can be resolved by using one of the following methods. 
162 | - by giving a database name as the second argument or 163 | - by using the fid syntax "database:feature" 164 | - by using the fid syntax "context:feature" (note that this does not necessarily resolve all ambiguity) 165 | 166 | Raises: 167 | DatabaseException: if feature is not found or given database info is ambiguous 168 | """ 169 | parts = fid.split(":") 170 | if db is not None: 171 | if len(parts) > 1: 172 | if parts[0] != db: 173 | raise DatabaseException("Ambiguous database identifiers: '{}' and '{}'".format(parts[0], db)) 174 | else: 175 | return self.finfo(parts[1], parts[0]) 176 | return self.finfo(fid, db) 177 | elif len(parts) == 1: 178 | return self.finfo(fid) 179 | elif parts[0] in self.get_databases(): 180 | return self.finfo(parts[1], parts[0]) 181 | elif parts[0] in self.get_contexts(): 182 | db = self.get_databases(parts[0])[0] 183 | return self.finfo(parts[1], db) 184 | else: 185 | raise DatabaseException("Feature '{}' not found".format(fid)) 186 | 187 | def faddr(self, fid: str, with_column=True): 188 | finfo = self.find(fid) 189 | 190 | if with_column: 191 | return "{}.{}.{}".format(finfo.database, finfo.table, finfo.column) 192 | else: 193 | return "{}.{}".format(finfo.database, finfo.table) 194 | 195 | def get_databases(self, context: str = None): 196 | return [dbname for (dbname, schema) in self.schemas.items() if not context or context == schema.context] 197 | 198 | def get_contexts(self, dbs=[]): 199 | return list(set([s.context for s in self.schemas.values() if not dbs or s.dbname in dbs])) 200 | 201 | def get_features(self, dbs=[]): 202 | return [name for (name, infos) in self.features.items() for info in infos if not dbs or info.database in dbs] 203 | 204 | def get_tables(self, dbs=[]): 205 | tables = [info.table for infos in self.features.values() for info in infos if not dbs or info.database in dbs] 206 | return list(set(tables)) 207 | 208 | def create_feature(self, name, default_value=None, target_db=None, permissive=False): 209 | db = target_db or self.maindb 210 | created = self.schemas[db].create_feature(name, default_value, permissive) 211 | for finfo in created: 212 | if not finfo.name in self.features.keys(): 213 | self.features[finfo.name] = [finfo] 214 | else: 215 | # this code disregards feature precedence by database position: 216 | self.features[finfo.name].append(finfo) 217 | 218 | def set_values(self, fname, value, hashes, target_db=None): 219 | finfo = self.finfo(fname, target_db) 220 | self.schemas[finfo.database].set_values(fname, value, hashes) 221 | 222 | def rename_feature(self, fname, new_fname, target_db=None): 223 | Schema.valid_feature_or_raise(new_fname) 224 | finfo = self.finfo(fname, target_db) 225 | self.execute("ALTER TABLE {}.features RENAME COLUMN {} TO {}".format(finfo.database, fname, new_fname)) 226 | if finfo.default is None: 227 | con = sqlite3.connect(self.schemas[finfo.database].path) 228 | with con as cursor: 229 | cursor.execute("ALTER TABLE {} RENAME TO {}".format(fname, new_fname)) 230 | con.close() 231 | self.features[fname].remove(finfo) 232 | if not len(self.features[fname]): 233 | del self.features[fname] 234 | finfo.name = new_fname 235 | if not new_fname in self.features.keys(): 236 | self.features[new_fname] = [finfo] 237 | else: 238 | # this code disregards feature precedence by database position: 239 | self.features[new_fname].append(finfo) 240 | 241 | def delete_feature(self, fname, target_db=None): 242 | finfo = self.finfo(fname, target_db) 243 | if finfo.default is None: 244 | self.execute("DROP TABLE 
IF EXISTS {}.{}".format(finfo.database, fname)) 245 | elif Database.sqlite3_version() >= 3.35: 246 | self.execute("ALTER TABLE {}.{} DROP COLUMN {}".format(finfo.database, finfo.table, fname)) 247 | else: 248 | raise DatabaseException("Cannot delete unique feature {} with SQLite versions < 3.35".format(fname)) 249 | self.features[fname].remove(finfo) 250 | if not len(self.features[fname]): 251 | del self.features[fname] 252 | 253 | def delete(self, fname, values=[], hashes=[], target_db=None): 254 | finfo = self.finfo(fname, target_db) 255 | w1 = "{cl} IN ('{v}')".format(cl=finfo.column, v="', '".join(values)) 256 | w2 = "hash IN ('{h}')".format(h="', '".join(hashes)) 257 | where = "{} AND {}".format(w1 if len(values) else "1=1", w2 if len(hashes) else "1=1") 258 | db = finfo.database 259 | if finfo.default is None: 260 | hashlist = [r[0] for r in self.query("SELECT DISTINCT(hash) FROM {d}.{tab} WHERE {w}".format(d=db, tab=fname, w=where))] 261 | self.execute("DELETE FROM {d}.{tab} WHERE {w}".format(d=db, tab=fname, w=where)) 262 | remaining = [ 263 | r[0] for r in self.query("SELECT DISTINCT(hash) FROM {d}.{tab} WHERE hash in ('{h}')".format(d=db, tab=fname, h="', '".join(hashlist))) 264 | ] 265 | setnone = [h for h in hashlist if not h in remaining] 266 | self.execute("UPDATE {d}.features SET {col} = 'None' WHERE hash IN ('{h}')".format(d=db, col=fname, h="', '".join(setnone))) 267 | else: 268 | self.execute("UPDATE {d}.features SET {col} = '{default}' WHERE {w}".format(d=db, col=fname, default=finfo.default, w=where)) 269 | 270 | def delete_hashes_entirely(self, hashes, target_db=None): 271 | tables = self.get_tables([target_db]) 272 | for table in tables: 273 | self.execute("DELETE FROM {}.{} WHERE hash IN ('{h}')".format(target_db, table, h="', '".join(hashes))) 274 | 275 | def copy_feature(self, old_name, new_name, target_db, hashlist=[]): 276 | old_finfo = self.find(old_name) 277 | data = self.query( 278 | "SELECT hash, {col} FROM {d}.{tab} WHERE hash IN ('{h}')".format( 279 | d=old_finfo.database, col=old_finfo.column, tab=old_finfo.table, h="', '".join(hashlist) 280 | ) 281 | ) 282 | for hash, value in data: 283 | self.set_values(new_name, value, [hash], target_db) 284 | -------------------------------------------------------------------------------- /gbd_server/static/w3.js: -------------------------------------------------------------------------------- 1 | /* W3.JS 1.04 April 2019 by w3schools.com */ 2 | "use strict"; 3 | var w3 = {}; 4 | w3.hide = function (sel) { 5 | w3.hideElements(w3.getElements(sel)); 6 | }; 7 | w3.hideElements = function (elements) { 8 | var i, l = elements.length; 9 | for (i = 0; i < l; i++) { 10 | w3.hideElement(elements[i]); 11 | } 12 | }; 13 | w3.hideElement = function (element) { 14 | w3.styleElement(element, "display", "none"); 15 | }; 16 | w3.show = function (sel, a) { 17 | var elements = w3.getElements(sel); 18 | if (a) {w3.hideElements(elements);} 19 | w3.showElements(elements); 20 | }; 21 | w3.showElements = function (elements) { 22 | var i, l = elements.length; 23 | for (i = 0; i < l; i++) { 24 | w3.showElement(elements[i]); 25 | } 26 | }; 27 | w3.showElement = function (element) { 28 | w3.styleElement(element, "display", "block"); 29 | }; 30 | w3.addStyle = function (sel, prop, val) { 31 | w3.styleElements(w3.getElements(sel), prop, val); 32 | }; 33 | w3.styleElements = function (elements, prop, val) { 34 | var i, l = elements.length; 35 | for (i = 0; i < l; i++) { 36 | w3.styleElement(elements[i], prop, val); 37 | } 38 | }; 39 | 
w3.styleElement = function (element, prop, val) { 40 | element.style.setProperty(prop, val); 41 | }; 42 | w3.toggleShow = function (sel) { 43 | var i, x = w3.getElements(sel), l = x.length; 44 | for (i = 0; i < l; i++) { 45 | if (x[i].style.display == "none") { 46 | w3.styleElement(x[i], "display", "block"); 47 | } else { 48 | w3.styleElement(x[i], "display", "none"); 49 | } 50 | } 51 | }; 52 | w3.addClass = function (sel, name) { 53 | w3.addClassElements(w3.getElements(sel), name); 54 | }; 55 | w3.addClassElements = function (elements, name) { 56 | var i, l = elements.length; 57 | for (i = 0; i < l; i++) { 58 | w3.addClassElement(elements[i], name); 59 | } 60 | }; 61 | w3.addClassElement = function (element, name) { 62 | var i, arr1, arr2; 63 | arr1 = element.className.split(" "); 64 | arr2 = name.split(" "); 65 | for (i = 0; i < arr2.length; i++) { 66 | if (arr1.indexOf(arr2[i]) == -1) {element.className += " " + arr2[i];} 67 | } 68 | }; 69 | w3.removeClass = function (sel, name) { 70 | w3.removeClassElements(w3.getElements(sel), name); 71 | }; 72 | w3.removeClassElements = function (elements, name) { 73 | var i, l = elements.length, arr1, arr2, j; 74 | for (i = 0; i < l; i++) { 75 | w3.removeClassElement(elements[i], name); 76 | } 77 | }; 78 | w3.removeClassElement = function (element, name) { 79 | var i, arr1, arr2; 80 | arr1 = element.className.split(" "); 81 | arr2 = name.split(" "); 82 | for (i = 0; i < arr2.length; i++) { 83 | while (arr1.indexOf(arr2[i]) > -1) { 84 | arr1.splice(arr1.indexOf(arr2[i]), 1); 85 | } 86 | } 87 | element.className = arr1.join(" "); 88 | }; 89 | w3.toggleClass = function (sel, c1, c2) { 90 | w3.toggleClassElements(w3.getElements(sel), c1, c2); 91 | }; 92 | w3.toggleClassElements = function (elements, c1, c2) { 93 | var i, l = elements.length; 94 | for (i = 0; i < l; i++) { 95 | w3.toggleClassElement(elements[i], c1, c2); 96 | } 97 | }; 98 | w3.toggleClassElement = function (element, c1, c2) { 99 | var t1, t2, t1Arr, t2Arr, j, arr, allPresent; 100 | t1 = (c1 || ""); 101 | t2 = (c2 || ""); 102 | t1Arr = t1.split(" "); 103 | t2Arr = t2.split(" "); 104 | arr = element.className.split(" "); 105 | if (t2Arr.length == 0) { 106 | allPresent = true; 107 | for (j = 0; j < t1Arr.length; j++) { 108 | if (arr.indexOf(t1Arr[j]) == -1) {allPresent = false;} 109 | } 110 | if (allPresent) { 111 | w3.removeClassElement(element, t1); 112 | } else { 113 | w3.addClassElement(element, t1); 114 | } 115 | } else { 116 | allPresent = true; 117 | for (j = 0; j < t1Arr.length; j++) { 118 | if (arr.indexOf(t1Arr[j]) == -1) {allPresent = false;} 119 | } 120 | if (allPresent) { 121 | w3.removeClassElement(element, t1); 122 | w3.addClassElement(element, t2); 123 | } else { 124 | w3.removeClassElement(element, t2); 125 | w3.addClassElement(element, t1); 126 | } 127 | } 128 | }; 129 | w3.getElements = function (id) { 130 | if (typeof id == "object") { 131 | return [id]; 132 | } else { 133 | return document.querySelectorAll(id); 134 | } 135 | }; 136 | w3.filterHTML = function(id, sel, filter) { 137 | var a, b, c, i, ii, iii, hit; 138 | a = w3.getElements(id); 139 | for (i = 0; i < a.length; i++) { 140 | b = a[i].querySelectorAll(sel); 141 | for (ii = 0; ii < b.length; ii++) { 142 | hit = 0; 143 | if (b[ii].innerText.toUpperCase().indexOf(filter.toUpperCase()) > -1) { 144 | hit = 1; 145 | } 146 | c = b[ii].getElementsByTagName("*"); 147 | for (iii = 0; iii < c.length; iii++) { 148 | if (c[iii].innerText.toUpperCase().indexOf(filter.toUpperCase()) > -1) { 149 | hit = 1; 150 | } 151 | } 152 
| if (hit == 1) { 153 | b[ii].style.display = ""; 154 | } else { 155 | b[ii].style.display = "none"; 156 | } 157 | } 158 | } 159 | }; 160 | w3.sortHTML = function(id, sel, sortvalue) { 161 | var a, b, i, ii, y, bytt, v1, v2, cc, j; 162 | a = w3.getElements(id); 163 | for (i = 0; i < a.length; i++) { 164 | for (j = 0; j < 2; j++) { 165 | cc = 0; 166 | y = 1; 167 | while (y == 1) { 168 | y = 0; 169 | b = a[i].querySelectorAll(sel); 170 | for (ii = 0; ii < (b.length - 1); ii++) { 171 | bytt = 0; 172 | if (sortvalue) { 173 | v1 = b[ii].querySelector(sortvalue).innerText; 174 | v2 = b[ii + 1].querySelector(sortvalue).innerText; 175 | } else { 176 | v1 = b[ii].innerText; 177 | v2 = b[ii + 1].innerText; 178 | } 179 | v1 = v1.toLowerCase(); 180 | v2 = v2.toLowerCase(); 181 | if ((j == 0 && (v1 > v2)) || (j == 1 && (v1 < v2))) { 182 | bytt = 1; 183 | break; 184 | } 185 | } 186 | if (bytt == 1) { 187 | b[ii].parentNode.insertBefore(b[ii + 1], b[ii]); 188 | y = 1; 189 | cc++; 190 | } 191 | } 192 | if (cc > 0) {break;} 193 | } 194 | } 195 | }; 196 | w3.slideshow = function (sel, ms, func) { 197 | var i, ss, x = w3.getElements(sel), l = x.length; 198 | ss = {}; 199 | ss.current = 1; 200 | ss.x = x; 201 | ss.ondisplaychange = func; 202 | if (!isNaN(ms) || ms == 0) { 203 | ss.milliseconds = ms; 204 | } else { 205 | ss.milliseconds = 1000; 206 | } 207 | ss.start = function() { 208 | ss.display(ss.current) 209 | if (ss.ondisplaychange) {ss.ondisplaychange();} 210 | if (ss.milliseconds > 0) { 211 | window.clearTimeout(ss.timeout); 212 | ss.timeout = window.setTimeout(ss.next, ss.milliseconds); 213 | } 214 | }; 215 | ss.next = function() { 216 | ss.current += 1; 217 | if (ss.current > ss.x.length) {ss.current = 1;} 218 | ss.start(); 219 | }; 220 | ss.previous = function() { 221 | ss.current -= 1; 222 | if (ss.current < 1) {ss.current = ss.x.length;} 223 | ss.start(); 224 | }; 225 | ss.display = function (n) { 226 | w3.styleElements(ss.x, "display", "none"); 227 | w3.styleElement(ss.x[n - 1], "display", "block"); 228 | } 229 | ss.start(); 230 | return ss; 231 | }; 232 | w3.includeHTML = function(cb) { 233 | var z, i, elmnt, file, xhttp; 234 | z = document.getElementsByTagName("*"); 235 | for (i = 0; i < z.length; i++) { 236 | elmnt = z[i]; 237 | file = elmnt.getAttribute("w3-include-html"); 238 | if (file) { 239 | xhttp = new XMLHttpRequest(); 240 | xhttp.onreadystatechange = function() { 241 | if (this.readyState == 4) { 242 | if (this.status == 200) {elmnt.innerHTML = this.responseText;} 243 | if (this.status == 404) {elmnt.innerHTML = "Page not found.";} 244 | elmnt.removeAttribute("w3-include-html"); 245 | w3.includeHTML(cb); 246 | } 247 | } 248 | xhttp.open("GET", file, true); 249 | xhttp.send(); 250 | return; 251 | } 252 | } 253 | if (cb) cb(); 254 | }; 255 | w3.getHttpData = function (file, func) { 256 | w3.http(file, function () { 257 | if (this.readyState == 4 && this.status == 200) { 258 | func(this.responseText); 259 | } 260 | }); 261 | }; 262 | w3.getHttpObject = function (file, func) { 263 | w3.http(file, function () { 264 | if (this.readyState == 4 && this.status == 200) { 265 | func(JSON.parse(this.responseText)); 266 | } 267 | }); 268 | }; 269 | w3.displayHttp = function (id, file) { 270 | w3.http(file, function () { 271 | if (this.readyState == 4 && this.status == 200) { 272 | w3.displayObject(id, JSON.parse(this.responseText)); 273 | } 274 | }); 275 | }; 276 | w3.http = function (target, readyfunc, xml, method) { 277 | var httpObj; 278 | if (!method) {method = "GET"; } 279 | if 
(window.XMLHttpRequest) { 280 | httpObj = new XMLHttpRequest(); 281 | } else if (window.ActiveXObject) { 282 | httpObj = new ActiveXObject("Microsoft.XMLHTTP"); 283 | } 284 | if (httpObj) { 285 | if (readyfunc) {httpObj.onreadystatechange = readyfunc;} 286 | httpObj.open(method, target, true); 287 | httpObj.send(xml); 288 | } 289 | }; 290 | w3.getElementsByAttribute = function (x, att) { 291 | var arr = [], arrCount = -1, i, l, y = x.getElementsByTagName("*"), z = att.toUpperCase(); 292 | l = y.length; 293 | for (i = -1; i < l; i += 1) { 294 | if (i == -1) {y[i] = x;} 295 | if (y[i].getAttribute(z) !== null) {arrCount += 1; arr[arrCount] = y[i];} 296 | } 297 | return arr; 298 | }; 299 | w3.dataObject = {}, 300 | w3.displayObject = function (id, data) { 301 | var htmlObj, htmlTemplate, html, arr = [], a, l, rowClone, x, j, i, ii, cc, repeat, repeatObj, repeatX = ""; 302 | htmlObj = document.getElementById(id); 303 | htmlTemplate = init_template(id, htmlObj); 304 | html = htmlTemplate.cloneNode(true); 305 | arr = w3.getElementsByAttribute(html, "w3-repeat"); 306 | l = arr.length; 307 | for (j = (l - 1); j >= 0; j -= 1) { 308 | cc = arr[j].getAttribute("w3-repeat").split(" "); 309 | if (cc.length == 1) { 310 | repeat = cc[0]; 311 | } else { 312 | repeatX = cc[0]; 313 | repeat = cc[2]; 314 | } 315 | arr[j].removeAttribute("w3-repeat"); 316 | repeatObj = data[repeat]; 317 | if (repeatObj && typeof repeatObj == "object" && repeatObj.length != "undefined") { 318 | i = 0; 319 | for (x in repeatObj) { 320 | i += 1; 321 | rowClone = arr[j]; 322 | rowClone = w3_replace_curly(rowClone, "element", repeatX, repeatObj[x]); 323 | a = rowClone.attributes; 324 | for (ii = 0; ii < a.length; ii += 1) { 325 | a[ii].value = w3_replace_curly(a[ii], "attribute", repeatX, repeatObj[x]).value; 326 | } 327 | (i === repeatObj.length) ? arr[j].parentNode.replaceChild(rowClone, arr[j]) : arr[j].parentNode.insertBefore(rowClone, arr[j]); 328 | } 329 | } else { 330 | console.log("w3-repeat must be an array. " + repeat + " is not an array."); 331 | continue; 332 | } 333 | } 334 | html = w3_replace_curly(html, "element"); 335 | htmlObj.parentNode.replaceChild(html, htmlObj); 336 | function init_template(id, obj) { 337 | var template; 338 | template = obj.cloneNode(true); 339 | if (w3.dataObject.hasOwnProperty(id)) {return w3.dataObject[id];} 340 | w3.dataObject[id] = template; 341 | return template; 342 | } 343 | function w3_replace_curly(elmnt, typ, repeatX, x) { 344 | var value, rowClone, pos1, pos2, originalHTML, lookFor, lookForARR = [], i, cc, r; 345 | rowClone = elmnt.cloneNode(true); 346 | pos1 = 0; 347 | while (pos1 > -1) { 348 | originalHTML = (typ == "attribute") ? 
rowClone.value : rowClone.innerHTML; 349 | pos1 = originalHTML.indexOf("{{", pos1); 350 | if (pos1 === -1) {break;} 351 | pos2 = originalHTML.indexOf("}}", pos1 + 1); 352 | lookFor = originalHTML.substring(pos1 + 2, pos2); 353 | lookForARR = lookFor.split("||"); 354 | value = undefined; 355 | for (i = 0; i < lookForARR.length; i += 1) { 356 | lookForARR[i] = lookForARR[i].replace(/^\s+|\s+$/gm, ''); //trim 357 | if (x) {value = x[lookForARR[i]];} 358 | if (value == undefined && data) {value = data[lookForARR[i]];} 359 | if (value == undefined) { 360 | cc = lookForARR[i].split("."); 361 | if (cc[0] == repeatX) {value = x[cc[1]]; } 362 | } 363 | if (value == undefined) { 364 | if (lookForARR[i] == repeatX) {value = x;} 365 | } 366 | if (value == undefined) { 367 | if (lookForARR[i].substr(0, 1) == '"') { 368 | value = lookForARR[i].replace(/"/g, ""); 369 | } else if (lookForARR[i].substr(0,1) == "'") { 370 | value = lookForARR[i].replace(/'/g, ""); 371 | } 372 | } 373 | if (value != undefined) {break;} 374 | } 375 | if (value != undefined) { 376 | r = "{{" + lookFor + "}}"; 377 | if (typ == "attribute") { 378 | rowClone.value = rowClone.value.replace(r, value); 379 | } else { 380 | w3_replace_html(rowClone, r, value); 381 | } 382 | } 383 | pos1 = pos1 + 1; 384 | } 385 | return rowClone; 386 | } 387 | function w3_replace_html(a, r, result) { 388 | var b, l, i, a, x, j; 389 | if (a.hasAttributes()) { 390 | b = a.attributes; 391 | l = b.length; 392 | for (i = 0; i < l; i += 1) { 393 | if (b[i].value.indexOf(r) > -1) {b[i].value = b[i].value.replace(r, result);} 394 | } 395 | } 396 | x = a.getElementsByTagName("*"); 397 | l = x.length; 398 | a.innerHTML = a.innerHTML.replace(r, result); 399 | } 400 | }; -------------------------------------------------------------------------------- /gbd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # MIT License 4 | 5 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 6 | 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 
16 | 17 | import os 18 | import sys 19 | import traceback 20 | import polars as pl 21 | 22 | from gbd_core.api import GBD, GBDException 23 | from gbd_core.grammar import ParserException 24 | from gbd_core import util, contexts, schema 25 | from gbd_core.util_argparse import * 26 | from gbd_init.feature_extractors import generic_extractors 27 | from gbd_init.instance_transformers import generic_transformers 28 | 29 | 30 | ### Command-Line Interface Entry Points 31 | def cli_hash(api: GBD, args): 32 | from gbd_core.contexts import identify 33 | 34 | print(identify(args.path)) 35 | 36 | 37 | def cli_init_local(api: GBD, args): 38 | from gbd_init.feature_extractors import init_local 39 | 40 | rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim} 41 | init_local(api, rlimits, args.path, args.target) 42 | 43 | 44 | def cli_init_generic(api: GBD, args): 45 | from gbd_init.feature_extractors import init_features_generic 46 | 47 | rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim} 48 | context = api.database.dcontext(args.target) 49 | df = api.query(args.query, args.hashes, [context + ":local"], collapse="MIN", group_by=context + ":hash") 50 | init_features_generic(args.initfuncname, api, rlimits, df, args.target) 51 | 52 | 53 | def cli_trans_generic(api: GBD, args): 54 | from gbd_init.instance_transformers import transform_instances_generic 55 | 56 | rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim} 57 | transform_instances_generic(args.transfuncname, api, rlimits, args.query, args.hashes, args.target, args.source, args.collapse) 58 | 59 | 60 | def cli_create(api: GBD, args): 61 | api.create_feature(args.name, args.unique, args.target) 62 | 63 | 64 | def cli_delete(api: GBD, args): 65 | if (args.hashes and len(args.hashes) or args.values and len(args.values)) and args.name: 66 | if args.force or util.confirm("Delete attributes of given hashes and/or values from '{}'?".format(args.name)): 67 | api.reset_values(args.name, args.values, args.hashes) 68 | elif args.hashes and not args.name: 69 | if args.force or util.confirm("Delete given hashes entirely?".format(args.name)): 70 | api.delete_hashes(args.hashes) 71 | elif args.force or util.confirm("Delete feature '{}' and all associated attributes?".format(args.name)): 72 | api.delete_feature(args.name) 73 | 74 | 75 | def cli_cleanup(api: GBD, args): 76 | if args.hashes and len(args.hashes): 77 | if args.force or util.confirm("Delete attributes of given hashes from all features?"): 78 | api.delete_hashes(args.hashes, args.target) 79 | 80 | 81 | def cli_rename(api: GBD, args): 82 | api.rename_feature(args.old_name, args.new_name) 83 | 84 | 85 | def cli_copy(api: GBD, args): 86 | api.copy_feature(args.old_name, args.new_name, args.target, args.query, args.hashes) 87 | 88 | 89 | def cli_get(api: GBD, args): 90 | df: pl.DataFrame = api.query(args.query, args.hashes, args.resolve, args.collapse, args.group_by, args.join_type) 91 | if args.header: 92 | print(args.delimiter.join(df.columns)) 93 | for row in df.iter_rows(named=True): 94 | print(args.delimiter.join([str(row[col]) if row[col] is not None else "[None]" for col in df.columns])) 95 | 96 | 97 | def cli_set(api: GBD, args): 98 | hashes = api.query(args.query, args.hashes)["hash"].to_list() 99 | if args.create: 100 | hashes = list(set(hashes + args.hashes)) 101 | if len(hashes) > 0: 102 | api.set_values(args.assign[0], args.assign[1], hashes) 103 | 104 | 105 | def cli_info(api: GBD, args): 106 | 
if args.contexts: 107 | print("# Available Contexts: " + ", ".join(contexts.contexts())) 108 | for context in contexts.contexts(): 109 | print() 110 | print("## " + contexts.description(context)) 111 | print(" - Context Prefix: " + context) 112 | print(" - File Extensions: " + ",".join(contexts.suffixes(context))) 113 | elif args.name is None: 114 | print("# Available Data Sources: " + ", ".join(api.get_databases())) 115 | for dbname in api.get_databases(): 116 | if len(api.get_features(dbname)): 117 | print() 118 | print("## " + api.get_database_path(dbname)) 119 | print(" - Name: " + dbname) 120 | feat = api.get_features(dbname) 121 | print(" - Features: " + " ".join(feat)) 122 | if args.verbose: 123 | for f in feat: 124 | info = api.database.find(":".join([dbname, f])) 125 | print(info) 126 | else: 127 | info = api.get_feature_info(args.name) 128 | for key in info: 129 | print("{}: {}".format(key, info[key])) 130 | 131 | 132 | def cli_server(api: GBD, args): 133 | from gbd_server import server 134 | 135 | util.eprint("Starting GBD Server on port {}...".format(args.port)) 136 | util.eprint(r""" 137 | Warning: All files referenced in the configured databases are now accessible on the specified port. 138 | If you do not trust the source of the databases, do not run the server. 139 | """) 140 | server.serve(api, args.port, args.logdir) 141 | 142 | 143 | ### Define Command-Line Interface and Map Sub-Commands to Methods 144 | def main(): 145 | parser = get_gbd_argparser() 146 | 147 | subparsers = parser.add_subparsers(help="Available Commands:", required=True, dest="gbd command") 148 | 149 | # INITIALIZATION 150 | parser_init = subparsers.add_parser("init", help="Initialize Database") 151 | add_resource_limits_arguments(parser_init) 152 | parser_init.add_argument("--target", help="Target database for new features (default: first db in list); also determines target context", default=None) 153 | 154 | parser_init_subparsers = parser_init.add_subparsers(help="Select Initialization Procedure:", required=True, dest="init what?") 155 | 156 | # init local paths: 157 | parser_init_local = parser_init_subparsers.add_parser("local", help="Initialize Local Hash/Path Entries") 158 | parser_init_local.add_argument("path", type=directory_type, help="Path to benchmarks") 159 | parser_init_local.set_defaults(func=cli_init_local) 160 | 161 | # hooks for generic feature extractors: 162 | for key in generic_extractors.keys(): 163 | gex = generic_extractors[key] 164 | parser_init_generic = parser_init_subparsers.add_parser(key, help=gex["description"]) 165 | add_query_and_hashes_arguments(parser_init_generic) 166 | parser_init_generic.set_defaults(func=cli_init_generic, initfuncname=key) 167 | 168 | # TRANSFORMATION 169 | parser_trans = subparsers.add_parser("transform", help="Transform Benchmarks") 170 | add_resource_limits_arguments(parser_trans) 171 | parser_trans.add_argument("--source", help="Source context", default=contexts.default_context()) 172 | parser_trans.add_argument("--target", help="Target database; determines target context (default: first db in list)", default=None) 173 | 174 | parser_trans_subparsers = parser_trans.add_subparsers(help="Select Transformation Procedure:", required=True, dest="transform how?") 175 | 176 | # hooks for generic instance transformers: 177 | for key in generic_transformers.keys(): 178 | gex = generic_transformers[key] 179 | parser_trans_generic = parser_trans_subparsers.add_parser(key, help=gex["description"]) 180 | add_query_and_hashes_arguments(parser_trans_generic) 
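# The set_defaults call below records both the dispatch function and the
# registered transformer name, so cli_trans_generic() receives the key via
# args.transfuncname; for a hypothetical transformer "sanitize" this yields
# the sub-command "gbd transform sanitize <query>".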
181 | parser_trans_generic.set_defaults(func=cli_trans_generic, transfuncname=key) 182 | parser_trans_generic.add_argument( 183 | "-c", 184 | "--collapse", 185 | default="group_concat", 186 | choices=["group_concat", "min", "max", "avg", "count", "sum", "none"], 187 | help="Specify a function for the handling of multiple feature values", 188 | ) 189 | 190 | # GBD HASH 191 | parser_hash = subparsers.add_parser("hash", help="Print hash for a single file") 192 | parser_hash.add_argument("path", type=file_type, help="Path to one benchmark") 193 | parser_hash.set_defaults(func=cli_hash) 194 | 195 | # GBD GET $QUERY 196 | parser_get = subparsers.add_parser("get", help="Get data by query (or hash-list via stdin)") 197 | add_query_and_hashes_arguments(parser_get) 198 | parser_get.add_argument("-r", "--resolve", help="List of feature names to resolve against", nargs="+", default=[]) 199 | parser_get.add_argument( 200 | "-c", 201 | "--collapse", 202 | default="group_concat", 203 | choices=["group_concat", "min", "max", "avg", "count", "sum", "none"], 204 | help="Specify a function for the handling of multiple feature values", 205 | ) 206 | parser_get.add_argument("-g", "--group_by", default=None, help="Group by the specified feature as the key, rather than by the primary key") 207 | parser_get.add_argument("--join-type", help="Join Type: treatment of missing values", choices=["INNER", "OUTER", "LEFT"], default="LEFT") 208 | parser_get.add_argument("-d", "--delimiter", default=" ", help="CSV delimiter to use in output") 209 | parser_get.add_argument("-H", "--header", action="store_true", help="Include header information in output") 210 | parser_get.set_defaults(func=cli_get) 211 | 212 | # GBD SET 213 | parser_set = subparsers.add_parser("set", help="Set specified attribute-value for query result") 214 | parser_set.add_argument("assign", type=key_value_type, help="key=value") 215 | parser_set.add_argument( 216 | "-c", "--create", help="Create given hashes if they do not exist yet (otherwise intersect with existing hashes)", action="store_true" 217 | ) 218 | add_query_and_hashes_arguments(parser_set) 219 | parser_set.set_defaults(func=cli_set) 220 | 221 | # CREATE/DELETE/MODIFY FEATURES 222 | parser_create = subparsers.add_parser("create", help="Create a new feature") 223 | parser_create.add_argument("name", type=column_type, help="Name of feature") 224 | parser_create.add_argument("-u", "--unique", help="Unique constraint: specify default-value of feature") 225 | parser_create.add_argument("--target", help="Target database (default: first in list)", default=None) 226 | parser_create.set_defaults(func=cli_create) 227 | 228 | parser_delete = subparsers.add_parser( 229 | "delete", help="Delete all values assiociated with given hashes (via argument or stdin) or remove feature if no hashes are given" 230 | ) 231 | parser_delete.add_argument("--hashes", help="Hashes for which to delete values", nargs="*", default=[]) 232 | parser_delete.add_argument("--values", help="Values to delete", nargs="*", default=[]) 233 | parser_delete.add_argument("name", type=column_type, help="Name of feature (default: all)", nargs="?") 234 | parser_delete.add_argument("-f", "--force", action="store_true", help="Do not ask for confirmation") 235 | parser_delete.set_defaults(func=cli_delete) 236 | 237 | parser_cleanup = subparsers.add_parser("cleanup", help="Delete given hashes from all features") 238 | parser_cleanup.add_argument("--hashes", help="Hashes for which to delete values", nargs="*", default=[]) 239 | 
parser_cleanup.add_argument("-f", "--force", action="store_true", help="Do not ask for confirmation") 240 | parser_cleanup.add_argument("--target", help="Target database (default: first in list)", default=None) 241 | parser_cleanup.set_defaults(func=cli_cleanup) 242 | 243 | parser_rename = subparsers.add_parser("rename", help="Rename feature") 244 | parser_rename.add_argument("old_name", type=column_type, help="Old name of feature") 245 | parser_rename.add_argument("new_name", type=column_type, help="New name of feature") 246 | parser_rename.set_defaults(func=cli_rename) 247 | 248 | parser_copy = subparsers.add_parser("copy", help="Copy feature") 249 | add_query_and_hashes_arguments(parser_copy) 250 | parser_copy.add_argument("--target", help="Target database (default: first in list)", default=None) 251 | parser_copy.add_argument("old_name", type=column_type, help="Old name of feature") 252 | parser_copy.add_argument("new_name", type=column_type, help="New name of feature") 253 | parser_copy.set_defaults(func=cli_copy) 254 | 255 | # GET META INFO 256 | parser_info = subparsers.add_parser("info", help="Print info about available features") 257 | parser_info.add_argument("-c", "--contexts", action="store_true", help="Print available contexts") 258 | parser_info.add_argument("name", type=column_type, help="Print info about specified feature", nargs="?") 259 | parser_info.set_defaults(func=cli_info) 260 | 261 | # RUN SERVER 262 | parser_server = subparsers.add_parser("serve", help="Run GBD Server") 263 | parser_server.add_argument("-p", "--port", help="Specify port on which to listen", default=os.environ.get("GBD_PORT") or 5000, type=int) 264 | parser_server.add_argument("-l", "--logdir", help="Specify directory for logfiles", default=os.environ.get("GBD_LOGS") or "./") 265 | parser_server.set_defaults(func=cli_server) 266 | 267 | # PARSE ARGUMENTS 268 | args = parser.parse_args() 269 | try: 270 | if hasattr(args, "hashes") and not sys.stdin.isatty(): 271 | if not args.hashes or len(args.hashes) == 0: 272 | args.hashes = util.read_hashes() # read hashes from stdin 273 | if hasattr(args, "target") and args.target is None: 274 | args.target = schema.Schema.dbname_from_path(args.db.split(os.pathsep)[0]) 275 | if args.db is None or len(args.db) == 0: 276 | util.eprint("No database specified. Use -d or set GBD_DB environment variable.") 277 | sys.exit(1) 278 | with GBD(args.db.split(os.pathsep), args.verbose) as api: 279 | args.func(api, args) 280 | except ModuleNotFoundError as e: 281 | util.eprint("Module '{}' not found. 
Please install it.".format(e.name)) 282 | if e.name == "gbdc": 283 | util.eprint("Find installation instructions at https://github.com/Udopia/gbdc") 284 | sys.exit(1) 285 | except ParserException as e: 286 | util.eprint("Failed to parse query: " + args.query) 287 | if args.verbose: 288 | util.eprint(traceback.format_exc()) 289 | sys.exit(1) 290 | except pl.exceptions.DataOrientationWarning as e: 291 | util.eprint(traceback.format_exc()) 292 | except Exception as e: 293 | util.eprint("{}: {}".format(type(e), str(e))) 294 | if args.verbose: 295 | util.eprint(traceback.format_exc()) 296 | sys.exit(1) 297 | 298 | 299 | if __name__ == "__main__": 300 | main() 301 | -------------------------------------------------------------------------------- /gbd_core/schema.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT) 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | import sqlite3 16 | import typing 17 | import os 18 | import csv 19 | import re 20 | 21 | from dataclasses import dataclass 22 | 23 | from gbd_core import contexts 24 | from gbd_core.util import eprint, confirm 25 | 26 | 27 | class SchemaException(Exception): 28 | pass 29 | 30 | 31 | @dataclass 32 | class FeatureInfo: 33 | name: str = None 34 | database: str = None 35 | table: str = None 36 | column: str = None 37 | default: str = None 38 | 39 | 40 | class Schema: 41 | def __init__(self, dbcon, dbname, path, features, context, csv=False): 42 | self.dbname = dbname 43 | self.path = path 44 | self.features = features 45 | self.context = context 46 | self.dbcon = dbcon 47 | self.csv = csv 48 | 49 | @classmethod 50 | def is_database(cls, path): 51 | if os.path.isfile(path): 52 | sz = os.path.getsize(path) 53 | if sz == 0: 54 | return True # new sqlite3 files can be empty 55 | if sz < 100: 56 | return False # sqlite header is 100 bytes 57 | with open(path, "rb") as fd: 58 | header = fd.read(100) # validate header 59 | return header[:16] == b"SQLite format 3\x00" 60 | elif confirm("Database '{}' does not exist. 
Create new database?".format(path)): 61 | sqlite3.connect(path).close() 62 | return True 63 | else: 64 | raise SchemaException("Database '{}' does not exist".format(path)) 65 | 66 | @classmethod 67 | def create(cls, path): 68 | try: 69 | if cls.is_database(path): 70 | return cls.from_database(path) 71 | else: 72 | return cls.from_csv(path) 73 | except Exception as e: 74 | raise SchemaException(str(e)) 75 | 76 | @classmethod 77 | def from_database(cls, path): 78 | dbname = cls.dbname_from_path(path) 79 | con = sqlite3.connect(path) 80 | features = cls.features_from_database(dbname, path, con) 81 | context = cls.context_from_database(dbname) 82 | return cls(con, dbname, path, features, context) 83 | 84 | @classmethod 85 | def from_csv(cls, path): 86 | dbname = cls.dbname_from_path(path) 87 | con = sqlite3.connect("file:{}?mode=memory&cache=shared".format(dbname), uri=True) 88 | features = cls.features_from_csv(dbname, path, con) 89 | context = cls.context_from_csv(dbname) 90 | return cls(con, dbname, path, features, context, True) 91 | 92 | # Import CSV to in-memory db, create according schema info 93 | @classmethod 94 | def features_from_csv(cls, dbname, path, con) -> typing.Dict[str, FeatureInfo]: 95 | features = dict() 96 | with open(path) as csvfile: 97 | temp_lines = csvfile.readline() + "\n" + csvfile.readline() 98 | dialect = csv.Sniffer().sniff(temp_lines, delimiters=";, \t") 99 | csvfile.seek(0) 100 | csvreader = csv.DictReader(csvfile, dialect=dialect) 101 | if "hash" in csvreader.fieldnames: 102 | cols = [re.sub("[^0-9a-zA-Z]+", "_", n) for n in csvreader.fieldnames] 103 | for colname in cols: 104 | features[colname] = FeatureInfo(colname, dbname, "features", colname, None) 105 | con.execute("CREATE TABLE IF NOT EXISTS {} ({})".format("features", ", ".join(cols))) 106 | for row in csvreader: 107 | con.execute("INSERT INTO {} VALUES ('{}')".format("features", "', '".join(row.values()))) 108 | con.commit() 109 | else: 110 | raise SchemaException("Column 'hash' not found in {}".format(csvfile)) 111 | return features 112 | 113 | # Create schema info for sqlite database 114 | @classmethod 115 | def features_from_database(cls, dbname, path, con) -> typing.Dict[str, FeatureInfo]: 116 | features = dict() 117 | sql_tables = "SELECT tbl_name FROM sqlite_master WHERE type = 'table'" 118 | tables = [tab for (tab,) in con.execute(sql_tables).fetchall() if not tab.startswith("_")] 119 | for table in tables: 120 | columns = con.execute("PRAGMA table_info({})".format(table)).fetchall() 121 | for index, colname, coltype, notnull, default_value, pk in columns: 122 | is_fk_column = table == "features" and colname in tables 123 | is_fk_hash = table != "features" and colname == "hash" 124 | if not is_fk_column and not is_fk_hash: 125 | fname = colname if table == "features" else table 126 | dval = default_value.strip('"') if default_value else None 127 | features[fname] = FeatureInfo(fname, dbname, table, colname, dval) 128 | return features 129 | 130 | @classmethod 131 | def context_from_csv(cls, path): 132 | return cls.context_from_name(Schema.dbname_from_path(path)) 133 | 134 | @classmethod 135 | def context_from_database(cls, path): 136 | # TODO: store context in database 137 | return cls.context_from_name(Schema.dbname_from_path(path)) 138 | 139 | @classmethod 140 | def context_from_name(cls, name): 141 | pair = name.split("_") 142 | if len(pair) > 1 and pair[0] in contexts.contexts(): 143 | return pair[0] 144 | else: 145 | return contexts.default_context() 146 | 147 | @classmethod 148 | def 
dbname_from_path(cls, path): 149 | filename = os.path.splitext(os.path.basename(path))[0] 150 | if filename[0].isdigit(): 151 | filename = contexts.default_context() + "_" + filename 152 | return re.sub("[^a-zA-Z0-9]", "_", filename) 153 | 154 | @classmethod 155 | def valid_feature_or_raise(cls, name): 156 | if not re.match("[a-zA-Z][a-zA-Z0-9_]*", name): 157 | raise SchemaException("Feature name '{}' must be alphanumeric (incl. underline) and start with a letter.".format(name)) 158 | # gbd_keywords = [ 'hash', 'value', 'local', 'filename', 'features' ] 159 | gbd_keywords = ["hash", "value", "features"] 160 | if name.lower() in gbd_keywords: 161 | raise SchemaException("Feature name '{}' is reserved.".format(name)) 162 | sqlite_keywords = [ 163 | "abort", 164 | "action", 165 | "add", 166 | "after", 167 | "all", 168 | "alter", 169 | "always", 170 | "analyze", 171 | "and", 172 | "as", 173 | "asc", 174 | "attach", 175 | "autoincrement", 176 | "before", 177 | "begin", 178 | "between", 179 | "by", 180 | "cascade", 181 | "case", 182 | "cast", 183 | "check", 184 | "collate", 185 | "column", 186 | "commit", 187 | "conflict", 188 | "constraint", 189 | "create", 190 | "cross", 191 | "current", 192 | "current_date", 193 | "current_time", 194 | "current_timestamp", 195 | "database", 196 | "default", 197 | "deferrable", 198 | "deferred", 199 | "delete", 200 | "desc", 201 | "detach", 202 | "distinct", 203 | "do", 204 | "drop", 205 | "each", 206 | "else", 207 | "end", 208 | "escape", 209 | "except", 210 | "exclude", 211 | "exclusive", 212 | "exists", 213 | "explain", 214 | "fail", 215 | "filter", 216 | "first", 217 | "following", 218 | "for", 219 | "foreign", 220 | "from", 221 | "full", 222 | "generated", 223 | "glob", 224 | "group", 225 | "groups", 226 | "having", 227 | "if", 228 | "ignore", 229 | "immediate", 230 | "in", 231 | "index", 232 | "indexed", 233 | "initially", 234 | "inner", 235 | "insert", 236 | "instead", 237 | "intersect", 238 | "into", 239 | "is", 240 | "isnull", 241 | "join", 242 | "key", 243 | "last", 244 | "left", 245 | "like", 246 | "limit", 247 | "match", 248 | "materialized", 249 | "natural", 250 | "no", 251 | "not", 252 | "nothing", 253 | "notnull", 254 | "null", 255 | "nulls", 256 | "of", 257 | "offset", 258 | "on", 259 | "or", 260 | "order", 261 | "others", 262 | "outer", 263 | "over", 264 | "partition", 265 | "plan", 266 | "pragma", 267 | "preceding", 268 | "primary", 269 | "query", 270 | "raise", 271 | "range", 272 | "recursive", 273 | "references", 274 | "regexp", 275 | "reindex", 276 | "release", 277 | "rename", 278 | "replace", 279 | "restrict", 280 | "returning", 281 | "right", 282 | "rollback", 283 | "row", 284 | "rows", 285 | "savepoint", 286 | "select", 287 | "set", 288 | "table", 289 | "temp", 290 | "temporary", 291 | "then", 292 | "ties", 293 | "to", 294 | "transaction", 295 | "trigger", 296 | "unbounded", 297 | "union", 298 | "unique", 299 | "update", 300 | "using", 301 | "vacuum", 302 | "values", 303 | "view", 304 | "virtual", 305 | "when", 306 | "where", 307 | "window", 308 | "with", 309 | "without", 310 | ] 311 | if name.lower() in sqlite_keywords or name.startswith("sqlite_"): 312 | raise SchemaException("Feature name '{}' is reserved by sqlite.".format(name)) 313 | 314 | def is_in_memory(self): 315 | return self.csv 316 | 317 | def get_connection(self): 318 | if self.is_in_memory(): 319 | return sqlite3.connect("file::memory:?cache=shared", uri=True) 320 | else: 321 | return sqlite3.connect(self.path) 322 | 323 | def execute(self, sql): 324 | con = 
325 |         cur = con.cursor()
326 |         cur.execute(sql)
327 |         con.commit()
328 |         con.close()
329 | 
330 |     def get_tables(self):
331 |         return list(set([f.table for f in self.get_features()]))
332 | 
333 |     def get_features(self):
334 |         return self.features.values()
335 | 
336 |     def has_feature(self, name):
337 |         return name in self.features.keys()
338 | 
339 |     def absorb(self, schema):
340 |         if self.is_in_memory() and schema.is_in_memory():
341 |             self.features.update(schema.features)
342 |         else:
343 |             raise SchemaException("Internal Error: Attempt to merge non-virtual schemata")
344 | 
345 |     def create_main_table_if_not_exists(self):
346 |         main_table = "features"
347 |         if not main_table in self.get_tables():
348 |             self.execute("CREATE TABLE IF NOT EXISTS {} (hash UNIQUE NOT NULL)".format(main_table))
349 |             # insert all known hashes into main table and create triggers
350 |             for table in [t for t in self.get_tables() if t != main_table]:
351 |                 self.execute("INSERT OR IGNORE INTO {} (hash) SELECT DISTINCT(hash) FROM {}".format(main_table, table))
352 |                 self.execute(
353 |                     """CREATE TRIGGER IF NOT EXISTS {}_dval AFTER INSERT ON {}
354 |                         BEGIN INSERT OR IGNORE INTO {} (hash) VALUES (NEW.hash); END""".format(table, table, main_table)
355 |                 )
356 |             self.features["hash"] = FeatureInfo("hash", self.dbname, main_table, "hash", None)
357 |             return [self.features["hash"]]
358 |         else:
359 |             return []
360 | 
361 |     def create_feature(self, name, default_value=None, permissive=False):
362 |         if not permissive: # internal use can be unchecked, e.g., to create the reserved features during initialization
363 |             Schema.valid_feature_or_raise(name)
364 | 
365 |         created = []
366 | 
367 |         if not self.has_feature(name):
368 |             # ensure existence of main table:
369 |             created.extend(self.create_main_table_if_not_exists())
370 | 
371 |             # create new feature:
372 |             main_table = "features"
373 |             self.execute("ALTER TABLE {} ADD {} TEXT NOT NULL DEFAULT {}".format(main_table, name, default_value or "None"))
374 |             if default_value is not None:
375 |                 # feature is unique and resides in main features-table:
376 |                 self.features[name] = FeatureInfo(name, self.dbname, main_table, name, default_value)
377 |             else:
378 |                 # feature is not unique and resides in a separate table (column in main features-table is a foreign key):
379 |                 self.execute("CREATE TABLE IF NOT EXISTS {} (hash TEXT NOT NULL, value TEXT NOT NULL, CONSTRAINT all_unique UNIQUE(hash, value))".format(name))
380 |                 self.execute("INSERT INTO {} (hash, value) VALUES ('None', 'None')".format(name))
381 |                 self.execute(
382 |                     """CREATE TRIGGER IF NOT EXISTS {}_hash AFTER INSERT ON {}
383 |                         BEGIN INSERT OR IGNORE INTO {} (hash) VALUES (NEW.hash); END""".format(name, name, main_table)
384 |                 )
385 |                 self.features[name] = FeatureInfo(name, self.dbname, name, "value", None)
386 | 
387 |             # update schema:
388 |             created.append(self.features[name])
389 | 
390 |         elif not permissive:
391 |             raise SchemaException("Feature '{}' already exists".format(name))
392 | 
393 |         return created
394 | 
395 |     def set_values(self, feature, value, hashes):
396 |         if not self.has_feature(feature):
397 |             raise SchemaException("Feature '{}' does not exist".format(feature))
398 |         if not len(hashes):
399 |             raise SchemaException("No hashes given")
400 |         table = self.features[feature].table
401 |         column = self.features[feature].column
402 |         values = ", ".join(["('{}', '{}')".format(hash, value) for hash in hashes])
403 |         if self.features[feature].default is None:
404 |             self.execute("INSERT OR IGNORE INTO {tab} (hash, {col}) VALUES {vals}".format(tab=table, col=column, vals=values))
405 |             self.execute("UPDATE features SET {col}=hash WHERE hash in ('{h}')".format(col=table, h="', '".join(hashes)))
406 |         else:
407 |             self.execute(
408 |                 "INSERT INTO {tab} (hash, {col}) VALUES {vals} ON CONFLICT (hash) DO UPDATE SET {col}='{val}' WHERE hash in ('{h}')".format(
409 |                     tab=table, col=column, val=value, vals=values, h="', '".join(hashes)
410 |                 )
411 |             )
412 | 
--------------------------------------------------------------------------------
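For orientation, the sketch below shows how the Schema API in gbd_core/schema.py is typically driven: Schema.create() opens an existing SQLite database (or imports a CSV into an in-memory database), create_feature() adds either a unique feature (with a default value, stored as a column of the main 'features' table) or a non-unique feature (without a default, stored in its own hash/value table), and set_values() writes a value for a list of hashes. This is a minimal sketch, not part of the repository; the database path, feature names, and hash strings are placeholders.

# Minimal usage sketch for gbd_core/schema.py (illustrative only, not part of the repository).
# "meta.db", the feature names, and the hash strings below are hypothetical placeholders.
from gbd_core.schema import Schema

schema = Schema.create("meta.db")   # open an SQLite file, or import a CSV into an in-memory db

# With a default value the feature is unique: it becomes a column of the main 'features' table.
schema.create_feature("family", default_value="unknown")

# Without a default value the feature is non-unique: it gets its own (hash, value) table,
# and a trigger keeps the main 'features' table in sync with newly inserted hashes.
schema.create_feature("runtime_kissat")

# Write one value for a list of instance hashes.
schema.set_values("family", "hardware-verification", ["hash-of-instance-1", "hash-of-instance-2"])

print(schema.get_tables())          # e.g. ['features', 'runtime_kissat']
print(schema.has_feature("family")) # True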