├── __init__.py
├── tests
    ├── __init__.py
    ├── run.sh
    ├── util.py
    ├── test_schema.py
    ├── test_grammar.py
    ├── test_gbdhash.py
    ├── test_initializer.py
    ├── test_db_nonunique_features.py
    ├── test_querybuilder.py
    ├── test_db_unique_features.py
    └── test_api.py
├── gbd_core
    ├── __init__.py
    ├── config.py
    ├── contexts.py
    ├── util_argparse.py
    ├── util.py
    ├── query.py
    ├── grammar.py
    ├── api.py
    ├── database.py
    └── schema.py
├── gbd_init
    ├── __init__.py
    ├── gbdhash.py
    ├── initializer.py
    ├── instance_transformers.py
    └── feature_extractors.py
├── gbd_server
    ├── __init__.py
    ├── static
    │   ├── img
    │   │   ├── gbd_logo.jpg
    │   │   ├── gbd_logo.png
    │   │   └── gbd_logo_small.png
    │   ├── main.css
    │   └── w3.js
    ├── templates
    │   └── index.html
    └── server.py
├── update_tool.sh
├── MANIFEST.in
├── docker
    ├── build.sh
    ├── Dockerfile.gbd
    ├── Dockerfile.nginx
    ├── entrypoint.nginx.sh
    ├── docker-compose.yml
    └── configs
    │   ├── nginx.conf
    │   └── nginx.https.conf
├── CITATION.cff
├── .gitignore
├── default_config.toml
├── .gitattributes
├── pyproject.toml
├── setup.py.backup
├── LICENSE
├── .gitlab-ci.yml
├── .github
    └── workflows
    │   ├── gh-pages-apidoc.yml
    │   └── docker-image.yml
├── README.md
└── gbd.py


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gbd_core/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gbd_init/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gbd_server/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/run.sh:
--------------------------------------------------------------------------------
1 | PYTHONPATH=.. python3 -m unittest *.py
2 | 


--------------------------------------------------------------------------------
/gbd_server/static/img/gbd_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo.jpg


--------------------------------------------------------------------------------
/gbd_server/static/img/gbd_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo.png


--------------------------------------------------------------------------------
/gbd_server/static/img/gbd_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Udopia/gbd/HEAD/gbd_server/static/img/gbd_logo_small.png


--------------------------------------------------------------------------------
/update_tool.sh:
--------------------------------------------------------------------------------
1 | sudo rm -rf dist/
2 | # sudo python3 setup.py develop sdist bdist_wheel
3 | # twine upload dist/*
4 | sudo python3 -m pip install --upgrade build twine
5 | sudo python3 -m build
6 | twine upload dist/*
7 | sudo rm -Rf gbd_tools.egg-info dist build
8 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include README.md
 2 | include LICENSE
 3 | include gbd_server/templates/*.html
 4 | include gbd_server/static/img/*.png
 5 | include gbd_server/static/img/*.jpg
 6 | include gbd_server/static/*.css
 7 | include gbd_server/static/*.js
 8 | include default_config.toml
 9 | 
10 | 


--------------------------------------------------------------------------------
/docker/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ $# -lt 1 ]; then
 4 | 	echo "Usage: $0 [ nginx | gbd | ... ]"
 5 | 	exit 0
 6 | fi
 7 | 
 8 | if [ ! -e Dockerfile.$1 ]; then
 9 | 	echo "Dockerfile.$1 not found"
10 | 	exit 1
11 | fi
12 | 
13 | docker build --no-cache -t my$1 -f Dockerfile.$1 .
14 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.gbd:
--------------------------------------------------------------------------------
 1 | FROM python:slim
 2 | 
 3 | #ENV GBD_DB=/gbd/meta.db:/gbd/base.db:/gbd/gate.db
 4 | ENV GBD_LOGS=/raid/gbd/logs
 5 | ENV GBD_PORT=44071
 6 | 
 7 | RUN apt-get update -y \
 8 |   && apt-get install -y wget \
 9 |   && apt-get clean \
10 |   && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
11 | 
12 | RUN pip install --no-cache-dir gbd-tools
13 | 
14 | WORKDIR /gbd
15 | 
16 | EXPOSE 44071
17 | 
18 | ENTRYPOINT [ "gbd", "serve" ]
19 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Iser"
 5 |   given-names: "Ashlin"
 6 |   orcid: "https://orcid.org/0000-0003-2904-232X"
 7 | - family-names: "Jabs"
 8 |   given-names: "Christoph"
 9 |   orcid: "https://orcid.org/0000-0003-3532-696X"
10 | title: "GBD Tools"
11 | version: 4.7.0
12 | doi: 10.5281/zenodo.10213944
13 | date-released: 2023-11-28
14 | url: "https://github.com/Udopia/gbd"
15 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.nginx:
--------------------------------------------------------------------------------
 1 | FROM nginx:alpine
 2 | 
 3 | ENV VIRTUAL_HOST=localhost
 4 | ENV AWSTATS_USER=statsuser
 5 | ENV AWSTATS_PASS=stats1234
 6 | 
 7 | RUN apk add --no-cache bash awstats apache2-utils
 8 | 
 9 | WORKDIR /awstats
10 | RUN mkdir -p /awstats/www
11 | 
12 | COPY configs/nginx.https.conf /etc/nginx/nginx.conf
13 | COPY entrypoint.nginx.sh /entrypoint.nginx.sh
14 | COPY configs/awstats.conf /etc/awstats/awstats.conf
15 | 
16 | EXPOSE 80
17 | 
18 | CMD [ "/entrypoint.nginx.sh" ]


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.db
 2 | *.xls
 3 | *.pyc
 4 | 
 5 | # Compiled class file
 6 | *.class
 7 | 
 8 | # Log file
 9 | *.log
10 | 
11 | # BlueJ files
12 | *.ctxt
13 | 
14 | # Mobile Tools for Java (J2ME)
15 | .mtj.tmp/
16 | 
17 | # Package Files #
18 | *.jar
19 | *.war
20 | *.nar
21 | *.ear
22 | *.zip
23 | *.tar.gz
24 | *.rar
25 | 
26 | # individual files and folders
27 | gbd_server/gbd-server-log*
28 | cli_config/*
29 | server/server_config/*
30 | */default_config/*
31 | server/cache/*
32 | .vscode/
33 | gbd-server-logs/*
34 | .eggs/*
35 | build/
36 | dist/
37 | *.egg-info/
38 | 


--------------------------------------------------------------------------------
/docker/entrypoint.nginx.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Configures nginx and starts it
 4 | sed -i "s/__VIRTUAL_HOST__/$VIRTUAL_HOST/g" /etc/nginx/nginx.conf
 5 | 
 6 | nginx
 7 | 
 8 | 
 9 | # Configures awstats user and password
10 | htpasswd -cb /awstats/htpasswd $AWSTATS_USER $AWSTATS_PASS
11 | 
12 | 
13 | # Configures cron and starts it
14 | /usr/bin/awstats_buildstaticpages.pl -config=$VIRTUAL_HOST -update -dir=/awstats/www
15 | printf "#!/bin/bash\n/usr/bin/awstats_buildstaticpages.pl -config=$VIRTUAL_HOST -update -dir=/awstats/www" > /etc/periodic/15min/awstats
16 | chmod +x /etc/periodic/15min/awstats
17 | ln -fs /usr/share/zoneinfo/Europe/Berlin /etc/localtime
18 | 
19 | crond -f -l 8


--------------------------------------------------------------------------------
/default_config.toml:
--------------------------------------------------------------------------------
 1 | [contexts]
 2 | default = "cnf"
 3 | cnf = { suffix = ".cnf", idfunc = "cnf_hash", description = "DIMACS Conjunctive Normal Form (CNF)" }
 4 | sancnf = { suffix = ".sanitized.cnf", idfunc = "cnf_hash", description = "Sanitized CNF" }
 5 | kis = { suffix = ".kis", idfunc = "cnf_hash", description = "k-Independent Set Problem Graph" }
 6 | opb = { suffix = ".opb", idfunc = "opb_hash", description = "Pseudo-Boolean Optimization Problem" }
 7 | wcnf = { suffix = ".wcnf", idfunc = "wcnf_hash", description = "Weighted CNF (WCNF)" }
 8 | wecnf = { suffix = ".wecnf", idfunc = "cnf_hash", description = "Weighted Extended CNF (WECNF)" }
 9 | 
10 | [transformers]
11 | 
12 | [extractors]


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * 	 text=auto eol=lf
 3 | 
 4 | *.cs     text diff=csharp
 5 | *.java   text diff=java
 6 | *.html   text diff=html
 7 | *.css    text
 8 | *.js     text
 9 | *.sql    text
10 | 
11 | *.csproj text merge=union
12 | *.sln    text merge=union eol=lf
13 | 
14 | *.docx   diff=astextplain
15 | *.DOCX   diff=astextplain
16 | 
17 | # absolute paths are ok, as are globs
18 | /**/postinst* text eol=lf
19 | 
20 | # paths that don't start with / are treated relative to the .gitattributes folder
21 | relative/path/*.txt text eol=lf
22 | 
23 | *.png binary
24 | *.jpg binary
25 | *.jpeg binary
26 | *.gif binary
27 | *.ico binary
28 | 
29 | gbd_server/static/css/* linguist-vendored
30 | 


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | services:
 3 |   nginx:
 4 |     image: udopia/nginx
 5 |     #image: mynginx
 6 |     environment:
 7 |     - VIRTUAL_HOST=benchmark-database.de
 8 |     - AWSTATS_USER=statsuser
 9 |     - AWSTATS_PASS=stats1234
10 |     ports:
11 |     - 80:80
12 |     - 443:443
13 |     restart: always
14 |     volumes:
15 |     - /home/iser/nginx/ssl:/etc/nginx/ssl
16 |     - /home/iser/nginx/ssl/bot:/etc/nginx/ssl/bot
17 | 
18 |   gbd:
19 |     depends_on:
20 |     - nginx
21 |     image: udopia/gbd
22 |     #image: mygbd
23 |     #environment:
24 |     #- GBD_DB=/gbd/meta.db:/gbd/base.db:/gbd/gate.db
25 |     ports:
26 |     - 44071:44071
27 |     volumes:
28 |     - /home/iser/gbd:/raid/gbd:ro
29 |     restart: always


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=42", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "gbd_tools"
 7 | version = "5.0.1"
 8 | description = "GBD Tools: Maintenance and Distribution of Benchmark Instances and their Attributes"
 9 | readme = "README.md"
10 | license-files = ["LICENSE"]
11 | requires-python = ">=3.6"
12 | authors = [{ name = "Ashlin Iser", email = "iser@kit.edu" }]
13 | urls = { Homepage = "https://github.com/Udopia/gbd" }
14 | classifiers = ["Programming Language :: Python :: 3"]
15 | dependencies = ["flask", "tatsu", "polars", "waitress", "pebble", "gbdc"]
16 | scripts = { gbd = "gbd:main" }
17 | 
18 | [tool.setuptools]
19 | include-package-data = true
20 | py-modules = ["gbd"]
21 | packages = ["gbd_core", "gbd_init", "gbd_server"]
22 | 


--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import os
 3 | 
 4 | def get_random_clause(max_len=10, max_vars=30):
 5 |     return ' '.join([str(random.randint(-max_vars, max_vars)) for _ in range(random.randint(0, max_len))]) + ' 0'
 6 | 
 7 | def get_random_formula(max_num=50):
 8 |     return '\n'.join([get_random_clause() for _ in range(random.randint(0, max_num))]) + '\n'
 9 | 
10 | def get_random_unique_filename(prefix='random', suffix='.cnf'):
11 |     filename = prefix + suffix
12 |     while os.path.exists(filename):
13 |         filename = '{}{}{}'.format(prefix, random.randint(0, 1000), suffix)
14 |     return filename
15 | 
16 | def get_random_cnffile(max_num=50):
17 |     filename = get_random_unique_filename()
18 |     with open(filename, 'w') as f:
19 |         f.write('p cnf {} {}\n'.format(random.randint(1, 100), random.randint(1, 100)))
20 |         f.write(get_random_formula(max_num))
21 |     return filename


--------------------------------------------------------------------------------
/setup.py.backup:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(name='gbd_tools',
 4 |   version='4.9.8',
 5 |   description='GBD Tools: Maintenance and Distribution of Benchmark Instances and their Attributes',
 6 |   long_description=open('README.md', 'rt').read(),
 7 |   long_description_content_type="text/markdown",
 8 |   url='https://github.com/Udopia/gbd',
 9 |   author='Ashlin Iser',
10 |   author_email='markus.iser@kit.edu',
11 |   packages=[
12 |     "gbd_core", 
13 |     "gbd_init",
14 |     "gbd_server"
15 |   ],
16 |   scripts=[
17 |     "gbd.py"
18 |   ],
19 |   include_package_data=True,
20 |   setup_requires=[
21 |     'wheel',
22 |     'setuptools'
23 |   ],
24 |   install_requires=[
25 |     'flask',
26 |     'tatsu',
27 |     'pandas',
28 |     'waitress',
29 |     'pebble',
30 |     'gbdc'
31 |   ],
32 |   install_obsoletes=['global-benchmark-database-tool'],
33 |   classifiers=[
34 |     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
35 |     "Programming Language :: Python :: 3"
36 |   ],
37 |   entry_points={
38 |     "console_scripts": [
39 |         "gbd = gbd:main"
40 |     ]
41 |   }
42 | )
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | stages:
 2 | - build-image
 3 | - tag-image
 4 | 
 5 | 
 6 | variables:
 7 |   BUILD_IMAGE: qpr-registry.iti.kit.edu/gbd/gbd:$CI_COMMIT_SHA
 8 |   RELEASE_IMAGE: qpr-registry.iti.kit.edu/gbd/gbd:latest
 9 | 
10 | build-image:
11 |   image: docker:stable
12 |   services:
13 |     - docker:dind
14 |   stage: build-image
15 |   tags:
16 |     - docker
17 |   rules:
18 |     - if: '$CI_COMMIT_BRANCH == "master"'
19 |     - if: '$CI_COMMIT_BRANCH == "develop"'
20 |   before_script:
21 |     - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
22 |   after_script:
23 |     - docker logout $CI_REGISTRY
24 |   script:
25 |     - docker build -t $BUILD_IMAGE .
26 |     - docker push $BUILD_IMAGE
27 | 
28 | tag-image:
29 |   image: docker:stable
30 |   services:
31 |     - docker:dind
32 |   stage: tag-image
33 |   tags:
34 |     - docker
35 |   rules:
36 |     - if: '$CI_COMMIT_BRANCH == "master"'
37 |   before_script:
38 |     - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
39 |   after_script:
40 |     - docker logout $CI_REGISTRY
41 |   script:
42 |     - docker pull $BUILD_IMAGE
43 |     - docker tag $BUILD_IMAGE $RELEASE_IMAGE
44 |     - docker push $RELEASE_IMAGE
45 | 
46 | 


--------------------------------------------------------------------------------
/docker/configs/nginx.conf:
--------------------------------------------------------------------------------
 1 | events {}
 2 | 
 3 | http {
 4 | 	include /etc/nginx/mime.types;
 5 | 	default_type application/octet-stream;
 6 | 	sendfile on;
 7 | 
 8 | 	proxy_http_version 1.1;
 9 | 	proxy_buffering off;
10 | 	proxy_set_header Host $host;
11 | 	proxy_set_header Upgrade $http_upgrade;
12 | 	proxy_set_header Connection "Upgrade";
13 | 	proxy_set_header X-Real-IP $remote_addr;
14 | 	proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
15 | 	proxy_set_header X-Forwarded-Host $host;
16 | 	proxy_set_header X-Forwarded-Port $server_port;
17 | 	proxy_set_header Proxy "";
18 | 	log_format combined_ip '$http_x_forwarded_for - $remote_user [$time_local] '
19 | 			'"$request" $status $body_bytes_sent '
20 | 			'"$http_referer" "$http_user_agent"';
21 | 
22 | 	server {
23 | 		listen 80;
24 | 		access_log /awstats/access.log combined_ip;
25 | 
26 | 		location / {
27 | 			#proxy_pass   http://127.0.0.1:44071;
28 | 			proxy_pass   http://gbd:44071;
29 | 		}
30 | 
31 | 		location /stats {
32 | 			alias /awstats/www;
33 | 			index awstats.__VIRTUAL_HOST__.html index.html;
34 | 			try_files $uri $uri/ index.html;
35 | 			auth_basic "Restricted area";
36 | 			auth_basic_user_file /awstats/htpasswd;
37 | 			access_log off;
38 | 
39 | 			location /stats/icon {
40 | 				alias /usr/lib/awstats/icon;
41 | 			}
42 | 		}
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/.github/workflows/gh-pages-apidoc.yml:
--------------------------------------------------------------------------------
 1 | name: apidocs
 2 | on:
 3 |   push:
 4 |     branches: [ "main" ]
 5 | 
 6 | jobs:
 7 |   deploy:
 8 |     runs-on: ubuntu-latest
 9 | 
10 |     steps:
11 |     - uses: actions/checkout@master
12 |     - name: Set up Python 3.8
13 |       uses: actions/setup-python@v2
14 |       with:
15 |         python-version: 3.8
16 | 
17 |     - name: Install requirements for documentation generation
18 |       run: |
19 |         python -m pip install --upgrade pip setuptools wheel
20 |         python -m pip install docutils pydoctor
21 | 
22 |     - name: Generate API documentation with pydoctor
23 |       run: |
24 | 
25 |         # Run pydoctor build
26 |         pydoctor \
27 |             --project-name=gbd \
28 |             --project-url=https://github.com/$GITHUB_REPOSITORY \
29 |             --html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \
30 |             --make-html \
31 |             --html-output=./apidocs \
32 |             --project-base-dir="$(pwd)" \
33 |             --docformat=restructuredtext \
34 |             --intersphinx=https://docs.python.org/3/objects.inv \
35 |             ./gbd_core
36 | 
37 |     - name: Push API documentation to Github Pages
38 |       uses: peaceiris/actions-gh-pages@v3
39 |       with:
40 |         github_token: ${{ secrets.GITHUB_TOKEN }}
41 |         publish_dir: ./apidocs
42 |         commit_message: "Generate API documentation"


--------------------------------------------------------------------------------
/gbd_core/config.py:
--------------------------------------------------------------------------------
 1 | import toml
 2 | import os
 3 | import importlib.resources as pkg_resources
 4 | 
 5 | ### Default Context
 6 | default = "cnf"
 7 | 
 8 | ### Load Configuration from Files
 9 | def load_config(default_config_path, user_config_path=None):
10 |     # Load the default configuration file
11 |     with pkg_resources.open_text('gbd_tools', default_config_path) as f:
12 |         config = toml.load(f)
13 |     
14 |     # If a user configuration file is provided, load it and update the config
15 |     if user_config_path and os.path.exists(user_config_path):
16 |         with open(user_config_path, 'r') as f:
17 |             user_config = toml.load(f)
18 |         config.update(user_config)
19 |     
20 |     return config
21 | 
22 | ### Convert ConfigParser to Dictionary
23 | def config_to_dict(config):
24 |     config_dict = {}
25 |     for context, details in config['contexts'].items():
26 |         config_dict[context] = {
27 |             "description": details["description"],
28 |             "suffix": details["suffix"],
29 |             "idfunc": globals()[details["idfunc"]],
30 |         }
31 |     return config_dict
32 | 
33 | ### Paths to Configuration Files
34 | default_config_path = "default_config.toml"
35 | user_config_path = "user_config.toml"  # Adjust this path as needed
36 | 
37 | ### Load and Convert Configuration
38 | config_parser = load_config(default_config_path, user_config_path)
39 | config = config_to_dict(config_parser)


--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image CI
 2 | 
 3 | on:
 4 | #  release:
 5 | #    types: [ published ]
 6 |   push:
 7 |     branches: [ "main" ]
 8 | #  pull_request:
 9 | #    branches: [ "main" ]
10 | 
11 | jobs:
12 | 
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |         - uses: actions/checkout@v1
19 |         - name: Login to DockerHub Registry
20 |           run: echo "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login -u udopia --password-stdin
21 | #        - name: Get the version
22 | #          id: vars
23 | #          run: echo ::set-output name=tag::$(echo ${GITHUB_REF:10})
24 |         - name: Build the tagged Docker GBD image
25 |           run: docker build docker/ --file docker/Dockerfile.gbd --tag udopia/gbd:${{ github.sha }}
26 |         - name: Push the tagged Docker GBD image
27 |           run: docker push udopia/gbd:${{ github.sha }}
28 |         - name: Build the latest Docker GBD image
29 |           run: docker build docker/ --file docker/Dockerfile.gbd --tag udopia/gbd:latest
30 |         - name: Push the latest Docker GBD image
31 |           run: docker push udopia/gbd:latest
32 |         - name: Build the tagged Docker NGINX image
33 |           run: docker build docker/ --file docker/Dockerfile.nginx --tag udopia/nginx:${{ github.sha }}
34 |         - name: Push the tagged Docker NGINX image
35 |           run: docker push udopia/nginx:${{ github.sha }}
36 |         - name: Build the latest Docker NGINX image
37 |           run: docker build docker/ --file docker/Dockerfile.nginx --tag udopia/nginx:latest
38 |         - name: Push the latest Docker NGINX image
39 |           run: docker push udopia/nginx:latest
40 | 
41 | 


--------------------------------------------------------------------------------
/docker/configs/nginx.https.conf:
--------------------------------------------------------------------------------
 1 | events {}
 2 | 
 3 | http {
 4 | 	include /etc/nginx/mime.types;
 5 | 	default_type application/octet-stream;
 6 | 	sendfile on;
 7 | 
 8 | 	proxy_http_version 1.1;
 9 | 	proxy_buffering off;
10 | 	proxy_set_header Host $host;
11 | 	proxy_set_header Upgrade $http_upgrade;
12 | 	proxy_set_header Connection "Upgrade";
13 | 	proxy_set_header X-Real-IP $remote_addr;
14 | 	proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
15 | 	proxy_set_header X-Forwarded-Host $host;
16 | 	proxy_set_header X-Forwarded-Port $server_port;
17 | 	proxy_set_header Proxy "";
18 | 	log_format combined_ip '$http_x_forwarded_for - $remote_user [$time_local] '
19 | 			'"$request" $status $body_bytes_sent '
20 | 			'"$http_referer" "$http_user_agent"';
21 | 
22 | 	server {
23 | 		listen 80;
24 | 		listen [::]:80;
25 | 		access_log /awstats/access.log combined_ip;
26 | 
27 | 		location ^~ /.well-known {
28 |             root /etc/nginx/ssl/bot;
29 |         }
30 | 
31 | 		location / {
32 | 			return 301 https://$host$request_uri;
33 | 		}
34 | 	}
35 | 
36 | 	server {
37 | 		listen 443 ssl http2;
38 | 		listen [::]:443 ssl http2;
39 | 		access_log /awstats/access.log combined_ip;
40 | 
41 | 		ssl_certificate     /etc/nginx/ssl/fullchain.pem;
42 |         ssl_certificate_key /etc/nginx/ssl/privkey.pem;
43 | 
44 | 		location / {
45 | 			proxy_pass   http://gbd:44071;
46 | 		}
47 | 
48 | 		location /stats {
49 | 			alias /awstats/www;
50 | 			index awstats.__VIRTUAL_HOST__.html index.html;
51 | 			try_files $uri $uri/ index.html;
52 | 			auth_basic "Restricted area";
53 | 			auth_basic_user_file /awstats/htpasswd;
54 | 			access_log off;
55 | 
56 | 			location /stats/icon {
57 | 				alias /usr/lib/awstats/icon;
58 | 			}
59 | 		}
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/tests/test_schema.py:
--------------------------------------------------------------------------------
 1 | ##!/usr/bin/python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import unittest
 6 | import sqlite3
 7 | 
 8 | from gbd_core.database import Database
 9 | from gbd_core.schema import Schema
10 | 
11 | from tests import util
12 | 
13 | class SchemaTestCase(unittest.TestCase):
14 | 
15 |     def setUp(self) -> None:
16 |         self.file = util.get_random_unique_filename('test', '.db')
17 |         sqlite3.connect(self.file).close()
18 |         self.name = Schema.dbname_from_path(self.file)
19 |         self.db = Database([self.file], verbose=False)
20 |         return super().setUp()
21 | 
22 |     def tearDown(self) -> None:
23 |         if os.path.exists(self.file):
24 |             os.remove(self.file)
25 |         return super().tearDown()
26 | 
27 |     def test_create_db(self):
28 |         self.assertTrue(Schema.is_database(self.file))
29 |         self.assertEqual(len(self.db.get_databases()), 1)
30 |         self.assertTrue(self.db.dexists(self.name))
31 |         
32 |     def test_create_unique_feature(self):
33 |         FEAT = "featA"
34 |         self.db.create_feature(FEAT, default_value="empty")
35 |         self.assertIn(FEAT, self.db.get_features())
36 |         self.assertIn("features", self.db.get_tables())
37 |         finfo = self.db.find(FEAT)
38 |         self.assertEqual(finfo.table, "features")
39 |         self.assertEqual(finfo.column, FEAT)
40 |         self.assertEqual(finfo.default, "empty")
41 |         self.assertEqual(finfo.database, self.name)
42 |         
43 |     def test_create_nonunique_feature(self):
44 |         FEAT = "featB"
45 |         self.db.create_feature(FEAT, default_value=None)
46 |         self.assertIn(FEAT, self.db.get_features())
47 |         self.assertIn("features", self.db.get_tables())
48 |         finfo = self.db.find(FEAT)
49 |         self.assertEqual(finfo.table, FEAT)
50 |         self.assertEqual(finfo.column, "value")
51 |         self.assertEqual(finfo.default, None)
52 |         self.assertEqual(finfo.database, self.name)
53 | 
54 | 


--------------------------------------------------------------------------------
/gbd_server/static/main.css:
--------------------------------------------------------------------------------
  1 | :root {
  2 |   --background-color-1: #FFD497;
  3 | 	--background-color-2: #E8E6EF; 
  4 | 	--text-color: #000000;
  5 | 	--link-color-1: #823329;
  6 | 	--link-color-2: #000000;
  7 | 	--border-color: #274060;
  8 | }
  9 | 
 10 | html, body {
 11 |   height: 100%;
 12 |   margin: 0;
 13 | }
 14 | 
 15 | ul li {
 16 |   padding: 0.3em;
 17 | }
 18 | 
 19 | .main {
 20 |   width: 100%;
 21 |   height: 100%;
 22 |   display: flex;
 23 |   flex-direction: column;
 24 |   flex-wrap: nowrap;
 25 |   background:var(--background-color-1);
 26 |   color:var(--text-color);
 27 |   font-family:Arial,Helvetica,sans-serif;
 28 | }
 29 | 
 30 | header {
 31 |   flex-shrink: 0;
 32 |   border-bottom: 3px solid var(--border-color);
 33 | }
 34 | 
 35 | header > img {
 36 |   float: left;
 37 |   margin: 1em;
 38 |   filter: drop-shadow(-11px -7px 3px var(--border-color));
 39 | }
 40 | 
 41 | header > .form {
 42 |   float: left;
 43 |   margin: 1em;
 44 | }
 45 | 
 46 | header > .help {
 47 |   padding: 1em;
 48 | }
 49 | 
 50 | header > .help > fieldset > ul {
 51 |   margin: 0em;
 52 | }
 53 | 
 54 | fieldset {
 55 |   border: 2px solid var(--border-color);
 56 | }
 57 | 
 58 | .content {
 59 |   flex-grow: 1;
 60 |   overflow: auto;
 61 |   min-height: 2em;
 62 |   background:var(--background-color-2);
 63 |   padding: 2px;
 64 | }
 65 | 
 66 | footer {
 67 |   padding: 1em;
 68 |   flex-shrink: 0;
 69 |   border-top: 3px solid var(--border-color);
 70 | }
 71 | 
 72 | a:link { color:var(--link-color-1); text-decoration:none; }
 73 | a:visited { color:var(--link-color-1); text-decoration:none; }
 74 | a:hover { color:var(--link-color-2); }
 75 | a.active { color:var(--link-color-2); }
 76 | 
 77 | input.query {
 78 | 	padding: 4px 8px;
 79 | 	margin: 12px 4px;
 80 | 	width: 600px;
 81 | }
 82 | 
 83 | select.features {
 84 | 	padding: 4px 8px;
 85 | 	margin: 12px 4px;
 86 | 	width: 600px;
 87 | }
 88 | 
 89 | button.submit {
 90 | 	padding: 4px 8px;
 91 | 	margin: 12px 4px;
 92 | }
 93 | 
 94 | table, th, td {
 95 |   border:1px solid var(--border-color);
 96 |   border-collapse: collapse;
 97 |   padding: 3px 7px;
 98 | }
 99 | 
100 | td {
101 |   vertical-align: top;
102 |   font-family: monospace;
103 | }
104 | 
105 | table th {
106 | 	background-color:var(--background-color-2);
107 | 	font-weight: bold;
108 | 	cursor: default;
109 | }
110 | 
111 | table.sortable th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after { 
112 | 	content: " \25B4\25BE" 
113 | }
114 | 


--------------------------------------------------------------------------------
/tests/test_grammar.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import unittest
 3 | 
 4 | from gbd_core.grammar import Parser, ParserException
 5 | 
 6 | class SchemaTestCase(unittest.TestCase):
 7 | 
 8 |     def setUp(self) -> None:
 9 |         return super().setUp()
10 | 
11 |     def tearDown(self) -> None:
12 |         return super().tearDown()
13 | 
14 |     def test_query_nesting(self):
15 |         parser = Parser("a = 1")
16 |         self.assertEqual(parser.get_features(), set(["a"]))
17 |         parser = Parser("a = 1 and b = 2")
18 |         self.assertEqual(parser.get_features(), set(["a", "b"]))
19 |         parser = Parser("a = 1 and (b = 2 or c = 3)")
20 |         self.assertEqual(parser.get_features(), set(["a", "b", "c"]))
21 |         parser = Parser("(b = 2 or c = 3) and a = 1")
22 |         self.assertEqual(parser.get_features(), set(["a", "b", "c"]))
23 | 
24 |     def test_query_string_constraints(self):
25 |         parser = Parser("a = val1")
26 |         self.assertEqual(parser.get_features(), set(["a"]))
27 |         parser = Parser("a = val1 and b != val2")
28 |         self.assertEqual(parser.get_features(), set(["a", "b"]))
29 |         parser = Parser("a like val1")
30 |         self.assertEqual(parser.get_features(), set(["a"]))
31 |         parser = Parser("a like val%")
32 |         self.assertEqual(parser.get_features(), set(["a"]))
33 |         parser = Parser("a like %val")
34 |         self.assertEqual(parser.get_features(), set(["a"]))
35 |         parser = Parser("a like %val%")
36 |         self.assertEqual(parser.get_features(), set(["a"]))
37 |         parser = Parser("a like val% and b unlike val%")
38 |         self.assertEqual(parser.get_features(), set(["a", "b"]))        
39 |         with self.assertRaises(ParserException):
40 |             parser = Parser("a = %val%")
41 | 
42 |     def test_query_arithmetic_constraints(self):
43 |         parser = Parser("a = (1 + 2)")
44 |         self.assertEqual(parser.get_features(), set(["a"]))
45 |         parser = Parser("a = (1 - 2)")
46 |         self.assertEqual(parser.get_features(), set(["a"]))
47 |         parser = Parser("a = ((1 + 2) / b)")
48 |         self.assertEqual(parser.get_features(), set(["a", "b"]))
49 |         parser = Parser("a = (b)")
50 |         self.assertEqual(parser.get_features(), set(["a", "b"]))
51 |         parser = Parser("a = b")
52 |         self.assertEqual(parser.get_features(), set(["a"]))
53 | 
54 |     def test_explicit_context(self):
55 |         parser = Parser("c:a = 1")
56 |         self.assertEqual(parser.get_features(), set(["c:a"]))
57 |         parser = Parser("c:a = 1 and d:b = 2")
58 |         self.assertEqual(parser.get_features(), set(["c:a", "d:b"]))


--------------------------------------------------------------------------------
/tests/test_gbdhash.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import random
 3 | import os
 4 | 
 5 | from gbd_core.contexts import identify
 6 | from tests import util
 7 | 
 8 | 
 9 | class TestGBDHash(unittest.TestCase):
10 | 
11 |     def setUp(self):
12 |         self.reference = util.get_random_formula()
13 |         self.ref_file = "reference.cnf"
14 |         with open(self.ref_file, 'w') as ref:
15 |             ref.write(self.reference)
16 |         self.reference_hash = identify(self.ref_file)
17 | 
18 |     def tearDown(self):
19 |         if self.currentResult.wasSuccessful():
20 |             os.remove(self.ref_file)
21 | 
22 |     def run(self, result=None):
23 |         self.currentResult = result
24 |         unittest.TestCase.run(self, result)
25 | 
26 |     def get_random_character(self):
27 |         c = chr(random.randint(0, 255))
28 |         return c if not c.isspace() else ' '
29 | 
30 |     def get_random_string(self, min_length=0, max_length=20):
31 |         return ''.join([self.get_random_character() for _ in range(random.randint(min_length, max_length))])
32 | 
33 |     def get_random_whitespace_character(self):
34 |         r = random.random()
35 |         return '\t' if r < 0.25 else '\r' if r < 0.5 else '\n' if r < 0.75 else ' '
36 | 
37 |     def get_random_whitespace(self, min_length=0, max_length=3):
38 |         return ''.join([self.get_random_whitespace_character() for _ in range(random.randint(min_length, max_length))])
39 |     
40 |     def get_random_header(self, p=0.5):
41 |         return "p cnf {} {}\n".format(random.randint(1, 100), random.randint(1, 100)) if random.random() < p else ""
42 | 
43 |     def get_random_comment(self, p=0.5):
44 |         return "c {}\n".format(self.get_random_string()) if random.random() < p else ""
45 | 
46 |     def test_randomized_variants(self):
47 |         for _ in range(100):
48 |             variant = self.get_random_whitespace()
49 |             variant += self.get_random_comment()
50 |             variant += self.get_random_header()
51 |             variant += self.get_random_whitespace()
52 |             for c in self.reference:
53 |                 if c.isspace():
54 |                     variant += self.get_random_whitespace()
55 |                 variant += c
56 |                 if c.isspace():
57 |                     variant += self.get_random_whitespace()
58 |             variant += self.get_random_whitespace()
59 | 
60 |             var_file = "variant.cnf"
61 |             with open(var_file, 'w') as f:
62 |                 f.write(variant)
63 |             variant_hash = identify(var_file)
64 |             if self.reference_hash == variant_hash:
65 |                 os.remove(var_file)
66 | 
67 |             self.assertEqual(self.reference_hash, variant_hash)
68 | 


--------------------------------------------------------------------------------
/gbd_core/contexts.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | from gbd_init.gbdhash import cnf_hash, opb_hash, wcnf_hash
16 | 
17 | ### Default Context
18 | default = "cnf"
19 | 
20 | ### Configuration of Available Contexts
21 | config = {
22 |     "cnf": {
23 |         "description": "Conjunctive Normal Form (CNF) in DIMACS format",
24 |         "suffix": ".cnf",
25 |         "idfunc": cnf_hash,
26 |     },
27 |     "sancnf": {
28 |         "description": "Sanitized Conjunctive Normal Form (CNF) in DIMACS format",
29 |         "suffix": ".sanitized.cnf",
30 |         "idfunc": cnf_hash,
31 |     },
32 |     "kis": {
33 |         "description": "k-Independent Set (KIS) in DIMACS-like graph format",
34 |         "suffix": ".kis",
35 |         "idfunc": cnf_hash,
36 |     },
37 |     "opb": {
38 |         "description": "Pseudo-Boolean Optimization Problem in OPB format",
39 |         "suffix": ".opb",
40 |         "idfunc": opb_hash,
41 |     },
42 |     "wecnf": {
43 |         "description": "Weighted Extended Conjunctive Normal Form (WECNF)",
44 |         "suffix": ".wecnf",
45 |         "idfunc": cnf_hash,
46 |     },
47 |     "wcnf": {
48 |         "description": "MaxSAT instances in WCNF format",
49 |         "suffix": ".wcnf",
50 |         "idfunc": wcnf_hash,
51 |     },
52 | }
53 | 
54 | 
55 | def description(context):
56 |     return config[context]["description"]
57 | 
58 | 
59 | def suffixes(context):
60 |     return [config[context]["suffix"] + p for p in ["", ".gz", ".lzma", ".xz", ".bz2"]]
61 | 
62 | 
63 | def idfunc(context):
64 |     return config[context]["idfunc"]
65 | 
66 | 
67 | def contexts():
68 |     return config.keys()
69 | 
70 | 
71 | def default_context():
72 |     return default
73 | 
74 | 
75 | def get_context_by_suffix(benchmark):
76 |     for context in contexts():
77 |         for suffix in suffixes(context):
78 |             if benchmark.endswith(suffix):
79 |                 return context
80 |     return None
81 | 
82 | 
83 | def identify(path, ct=None):
84 |     context = ct or get_context_by_suffix(path)
85 |     if context is None:
86 |         raise Exception("Unable to associate context: " + path)
87 |     else:
88 |         idf = idfunc(context)
89 |         return idf(path)
90 | 


--------------------------------------------------------------------------------
/gbd_init/gbdhash.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | import io
16 | import hashlib
17 | 
18 | import gzip
19 | import bz2
20 | import lzma
21 | 
22 | 
23 | def open_file(filename, mode):
24 |     if filename.endswith(".gz"):
25 |         return gzip.open(filename, mode)
26 |     elif filename.endswith(".bz2"):
27 |         return bz2.open(filename, mode)
28 |     elif filename.endswith(".lzma") or filename.endswith(".xz"):
29 |         return lzma.open(filename, mode)
30 |     else:
31 |         return open(filename, mode)
32 | 
33 | 
34 | try:
35 |     from gbdc import opbhash as opb_hash
36 | except ImportError:
37 | 
38 |     def opb_hash(filename):
39 |         raise Exception("Unable to import opbhash. Please install or update gbdc: https://github.com/Udopia/gbdc")
40 | 
41 | 
42 | try:
43 |     from gbdc import gbdhash as cnf_hash
44 | except ImportError:
45 |     try:
46 |         from gbdhashc import gbdhash as cnf_hash
47 |     except ImportError:
48 | 
49 |         def cnf_hash(filename):
50 |             file = open_file(filename, "rb")
51 |             buff = io.BufferedReader(file, io.DEFAULT_BUFFER_SIZE * 16)
52 | 
53 |             space = False
54 |             skip = False
55 |             start = True
56 |             cldelim = True
57 |             hash_md5 = hashlib.md5()
58 | 
59 |             for byte in iter(lambda: buff.read(1), b""):
60 |                 if not skip and (byte >= b"0" and byte <= b"9" or byte == b"-"):
61 |                     cldelim = byte == b"0" and (space or start)
62 |                     start = False
63 |                     if space:
64 |                         space = False
65 |                         hash_md5.update(b" ")
66 |                     hash_md5.update(byte)
67 |                 elif byte <= b" ":
68 |                     space = not start  # remember non-leading space characters
69 |                     skip = skip and byte != b"\n" and byte != b"\r"  # comment line ended
70 |                 else:  # byte == b'c' or byte == b'p':
71 |                     skip = True  # do not hash comment and header line
72 | 
73 |             if not cldelim:
74 |                 hash_md5.update(b" 0")
75 | 
76 |             file.close()
77 | 
78 |             return hash_md5.hexdigest()
79 | 
80 | 
81 | try:
82 |     from gbdc import wcnfhash as wcnf_hash
83 | except ImportError:
84 | 
85 |     def wcnf_hash(filename):
86 |         raise Exception("Unable to import wcnfhash. Please install or update gbdc: https://github.com/Udopia/gbdc")
87 | 


--------------------------------------------------------------------------------
/tests/test_initializer.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import unittest
 4 | import polars as pl
 5 | import random
 6 | import sqlite3
 7 | 
 8 | from gbd_core.database import Database
 9 | from gbd_core.schema import Schema
10 | from gbd_core.api import GBD, GBDException
11 | from gbd_init.initializer import Initializer
12 | from gbd_core.contexts import identify
13 | from gbd_init.feature_extractors import init_local, init_features_generic, generic_extractors
14 | 
15 | from tests import util
16 | 
17 | class InitTestCase(unittest.TestCase):
18 | 
19 |     def setUp(self) -> None:
20 |         self.file = util.get_random_unique_filename('test', '.db')
21 |         sqlite3.connect(self.file).close()
22 |         self.name = Schema.dbname_from_path(self.file)
23 |         self.db = Database([self.file], verbose=False)
24 |         self.benchmark = "benchmark.cnf"
25 |         self.dir = os.path.dirname(os.path.realpath(self.benchmark))
26 |         with open(self.benchmark, 'w') as file:            
27 |             file.write(util.get_random_formula(20))
28 |         self.reference_hash = identify(self.benchmark)
29 |         return super().setUp()
30 | 
31 |     def tearDown(self) -> None:
32 |         if os.path.exists(self.file):
33 |             os.remove(self.file)
34 |         if os.path.exists(self.benchmark):
35 |             os.remove(self.benchmark)
36 |         return super().tearDown()
37 | 
38 |     def init_random(self, hash, path, limits):
39 |         return [ ('random', hash, random.randint(1, 1000)) ]
40 | 
41 |     def test_init_random(self):
42 |         api = GBD([self.file], verbose=False)
43 |         rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 }
44 |         init = Initializer(api, rlimits, self.name, [('random', 0)], self.init_random)
45 |         init.create_features()
46 |         self.assertTrue(api.feature_exists('random'))
47 |         df: pl.DataFrame  = pl.DataFrame([(str(n), None) for n in range(100)], schema=["hash", "local"], orient="row")
48 |         init.run(df)
49 |         df: pl.DataFrame = api.query("random > 0", [], ["random"])
50 |         self.assertEqual(len(df), 100)
51 | 
52 |     def test_init_local(self):
53 |         api = GBD([self.file], verbose=False)
54 |         rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 }
55 |         init_local(api, rlimits, self.dir, self.name)
56 |         self.assertTrue(api.feature_exists('local'))
57 |         df: pl.DataFrame = api.query("local like %benchmark.cnf", [], ["local"])
58 |         self.assertEqual(len(df), 1)
59 |         self.assertEqual(df.to_dicts()[0]['local'], os.path.realpath(self.benchmark))
60 |         self.assertEqual(df.to_dicts()[0]['hash'], self.reference_hash)
61 | 
62 |     def test_init_cnf_features_generic(self):
63 |         api = GBD([self.file], verbose=False)
64 |         rlimits = { 'jobs': 1, 'tlim': 5000, 'mlim': 2000, 'flim': 1000 }
65 |         init_local(api, rlimits, self.dir, self.name)
66 |         df: pl.DataFrame = api.query("local like %benchmark.cnf", [], ["local"])
67 |         for key in generic_extractors.keys():
68 |             if 'cnf' in generic_extractors[key]['contexts']:
69 |                 init_features_generic(key, api, rlimits, df, self.name)
70 |                 for feature in generic_extractors[key]['features']:
71 |                     self.assertTrue(api.feature_exists(feature[0]))
72 | 
73 |     


--------------------------------------------------------------------------------
/gbd_init/initializer.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | import multiprocessing
16 | import time
17 | import pebble
18 | from concurrent.futures import as_completed
19 | import polars as pl
20 | 
21 | from gbd_core.util import eprint
22 | from gbd_core.api import GBD, GBDException
23 | from gbd_core import util
24 | import gbdc
25 | import os
26 | 
27 | 
28 | class InitializerException(Exception):
29 |     pass
30 | 
31 | 
32 | class Initializer:
33 |     def __init__(self, api: GBD, rlimits: dict, target_db: str, features: list, initfunc):
34 |         self.api = api
35 |         self.api.database.set_auto_commit(False)
36 |         self.target_db = target_db
37 |         self.features = features
38 |         self.initfunc = initfunc
39 |         self.rlimits = rlimits
40 | 
41 |     def prep_data(self, rec, hash):
42 |         return [
43 |             (key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items() if self.api.feature_exists(key)
44 |         ]
45 | 
46 |     def create_features(self):
47 |         for name, default in self.features:
48 |             self.api.database.create_feature(name, default, self.target_db, True)
49 |         self.api.database.commit()
50 | 
51 |     def save_features(self, result: list):
52 |         for attr in result:
53 |             name, hashv, value = attr[0], attr[1], attr[2]
54 |             self.api.database.set_values(name, value, [hashv], self.target_db)
55 |         self.api.database.commit()
56 | 
57 |     def run(self, instances: pl.DataFrame):
58 |         if self.rlimits["jobs"] == 1:
59 |             self.init_sequential(instances)
60 |         else:
61 |             self.init_parallel_pp(instances)
62 | 
63 |     def init_sequential(self, instances: pl.DataFrame):
64 |         for row in instances.iter_rows(named=True):
65 |             result = self.initfunc(row["hash"], row["local"], self.rlimits)
66 |             self.save_features(result)
67 | 
68 |     def init_parallel_pp(self, instances: pl.DataFrame):
69 |         with pebble.ProcessPool(max_workers=self.rlimits["jobs"], max_tasks=1, context=multiprocessing.get_context("forkserver")) as p:
70 |             futures = [p.schedule(self.initfunc, (row["hash"], row["local"], self.rlimits)) for row in instances.iter_rows(named=True)]
71 |             for f in as_completed(futures):  # , timeout=api.tlim if api.tlim > 0 else None):
72 |                 try:
73 |                     result = f.result()
74 |                     self.save_features(result)
75 |                 except pebble.ProcessExpired as e:
76 |                     f.cancel()
77 |                     util.eprint("{}: {}".format(e.__class__.__name__, e))
78 |                 except GBDException as e:  # might receive special handling in the future
79 |                     util.eprint("{}: {}".format(e.__class__.__name__, e))
80 |                 except Exception as e:
81 |                     import traceback
82 | 
83 |                     traceback.print_exc()
84 |                     util.eprint("{}: {}".format(e.__class__.__name__, e))
85 | 


--------------------------------------------------------------------------------
/tests/test_db_nonunique_features.py:
--------------------------------------------------------------------------------
 1 | ##!/usr/bin/python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import unittest
 6 | import sqlite3
 7 | 
 8 | from gbd_core.query import GBDQuery
 9 | from gbd_core.database import Database, DatabaseException
10 | from gbd_core.schema import Schema
11 | from tests import util
12 | 
13 | class DatabaseTestCase(unittest.TestCase):
14 | 
15 |     feat = "nonunique_feature"
16 |     val1 = "value1"
17 |     val2 = "value2"
18 | 
19 |     def setUp(self) -> None:
20 |         self.file = util.get_random_unique_filename('test', '.db')
21 |         sqlite3.connect(self.file).close()
22 |         self.name = Schema.dbname_from_path(self.file)
23 |         self.db = Database([self.file], verbose=False)
24 |         self.db.create_feature(self.feat, default_value=None)
25 |         self.db.set_values(self.feat, self.val1, ["a", "b", "c"])
26 |         self.db.set_values(self.feat, self.val2, ["a", "b", "c"])
27 |         return super().setUp()
28 | 
29 |     def tearDown(self) -> None:
30 |         if os.path.exists(self.file):
31 |             os.remove(self.file)
32 |         return super().tearDown()
33 | 
34 |     def query(self, feat, val):
35 |         qb = GBDQuery(self.db, "{}={}".format(feat, val))
36 |         q = qb.build_query()
37 |         return [ hash for (hash, ) in self.db.query(q) ]
38 | 
39 |     def dump(self):
40 |         import sqlite3
41 |         conn = sqlite3.connect(self.file)
42 |         for line in conn.iterdump():
43 |             print(line)
44 |         conn.close()
45 | 
46 |     # Test that the feature values are initialized correctly in test setup
47 |     def test_feature_values_exist(self):
48 |         res = self.query(self.feat, self.val1)
49 |         self.assertEqual(len(res), 3)
50 |         self.assertSetEqual(set(res), set(["a", "b", "c"]))
51 |         res = self.query(self.feat, self.val2)
52 |         self.assertEqual(len(res), 3)
53 |         self.assertSetEqual(set(res), set(["a", "b", "c"]))
54 | 
55 |     # Delete specific hash-value pair and check that it is gone and the others are still there
56 |     def test_feature_values_delete_hash_value(self):
57 |         self.db.delete(self.feat, [ self.val1 ], ["a"])
58 |         res = self.query(self.feat, self.val1)
59 |         self.assertEqual(len(res), 2)
60 |         self.assertSetEqual(set(res), set(["b", "c"]))
61 |         res = self.query(self.feat, self.val2)
62 |         self.assertEqual(len(res), 3)
63 |         self.assertSetEqual(set(res), set(["a", "b", "c"]))
64 | 
65 |     # Delete specific hash and check that it is gone and the others are still there
66 |     def test_feature_values_delete_hash(self):
67 |         self.db.delete(self.feat, [ ], ["a"])
68 |         res = self.query(self.feat, self.val1)
69 |         self.assertEqual(len(res), 2)
70 |         self.assertSetEqual(set(res), set(["b", "c"]))
71 |         res = self.query(self.feat, self.val2)
72 |         self.assertEqual(len(res), 2)
73 |         self.assertSetEqual(set(res), set(["b", "c"]))
74 |         res = self.query(self.feat, "None")
75 |         self.assertEqual(len(res), 1)
76 |         self.assertSetEqual(set(res), set(["a"]))
77 | 
78 |     # Delete specific value and check that it is gone and the others are still there
79 |     def test_feature_values_delete_value(self):
80 |         self.db.delete(self.feat, [ self.val1 ], [ ])
81 |         res = self.query(self.feat, self.val1)
82 |         self.assertEqual(len(res), 0)
83 |         res = self.query(self.feat, self.val2)
84 |         self.assertEqual(len(res), 3)
85 |         self.assertSetEqual(set(res), set([ "a", "b", "c" ]))
86 | 
87 |     # Delete feature
88 |     def test_nonunique_feature_delete(self):
89 |         self.db.delete_feature(self.feat)
90 |         self.assertRaises(DatabaseException, self.db.find, self.feat)


--------------------------------------------------------------------------------
/gbd_core/util_argparse.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | import argparse
16 | import os
17 | import re
18 | 
19 | 
20 | def get_gbd_argparser():
21 |     parser = argparse.ArgumentParser(description="GBD Benchmark Database")
22 |     parser.add_argument("-d", "--db", type=gbd_db_type, default=os.environ.get("GBD_DB"), help="Specify database to work with")
23 |     parser.add_argument("-v", "--verbose", action="store_true", help="Print additional (or diagnostic) information to stderr")
24 |     return parser
25 | 
26 | 
27 | def add_query_and_hashes_arguments(parser: argparse.ArgumentParser):
28 |     parser.add_argument("query", help="GBD Query", nargs="?")
29 |     parser.add_argument(
30 |         "--hashes", help="Explicitly select instances: Hashes can be passed as arguments to this option, but also via <stdin>.", nargs="*", default=[]
31 |     )
32 | 
33 | 
34 | def add_resource_limits_arguments(parser: argparse.ArgumentParser):
35 |     parser.add_argument("-j", "--jobs", default=1, type=int, help="Set number of parallel jobs")
36 |     parser.add_argument(
37 |         "-t",
38 |         "--tlim",
39 |         default=5000,
40 |         type=int,
41 |         help="Time limit (sec) per instance for 'init' sub-commands (also used for score calculation in 'eval' and 'plot')",
42 |     )
43 |     parser.add_argument("-m", "--mlim", default=2000, type=int, help="Memory limit (MB) per instance for 'init' sub-commands")
44 |     parser.add_argument("-f", "--flim", default=1000, type=int, help="File size limit (MB) per instance for 'init' sub-commands which create files")
45 | 
46 | 
47 | ### Argument Types for Input Sanitation in ArgParse Library
48 | def directory_type(path):
49 |     if not os.path.isdir(path):
50 |         raise argparse.ArgumentTypeError("{0} is not a directory".format(path))
51 |     if os.access(path, os.R_OK):
52 |         return os.path.abspath(path)
53 |     else:
54 |         raise argparse.ArgumentTypeError("{0} is not readable".format(path))
55 | 
56 | 
57 | def file_type(path):
58 |     if not os.path.isfile(path):
59 |         raise argparse.ArgumentTypeError("{0} is not a regular file".format(path))
60 |     if os.access(path, os.R_OK):
61 |         return os.path.abspath(path)
62 |     else:
63 |         raise argparse.ArgumentTypeError("{0} is not readable".format(path))
64 | 
65 | 
66 | def column_type(s):
67 |     pat = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$")
68 |     if not pat.match(s):
69 |         raise argparse.ArgumentTypeError('Column "{0}" does not match regular expression {1}'.format(s, pat.pattern))
70 |     return s
71 | 
72 | 
73 | def key_value_type(s):
74 |     tup = s.split("=", 1)
75 |     if len(tup) != 2:
76 |         raise argparse.ArgumentTypeError("key-value type: {0} must be separated by exactly one = ".format(s))
77 |     return (column_type(tup[0]), tup[1])
78 | 
79 | 
80 | def gbd_db_type(dbstr):
81 |     if not dbstr:
82 |         default = os.environ.get("GBD_DB")
83 |         if not default:
84 |             raise argparse.ArgumentTypeError("Datasources Missing: Set GBD_DB environment variable (Find databases here: https://benchmark-database.de)")
85 |         return default  # .split(':')
86 |     return dbstr
87 | 


--------------------------------------------------------------------------------
/tests/test_querybuilder.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import sqlite3
 3 | import os
 4 | 
 5 | from gbd_core.schema import Schema
 6 | from gbd_core.database import Database
 7 | from gbd_core.query import GBDQuery
 8 | 
 9 | import tests.util as util
10 | 
11 | class QueryNonUniqueTestCase(unittest.TestCase):
12 | 
13 |     feat = "nonuniquefeature"
14 |     feat2 = "nonuniquefeature2"
15 |     feat3 = "numericfeature"
16 |     val1 = "value1"
17 |     val2 = "value2"
18 |     hashes = [ "a", "b", "c" ]
19 | 
20 |     def setUp(self) -> None:
21 |         self.file1 = util.get_random_unique_filename('test1', '.db')
22 |         self.file2 = util.get_random_unique_filename('test2', '.db')
23 |         sqlite3.connect(self.file1).close()
24 |         sqlite3.connect(self.file2).close()
25 |         self.dbname1 = Schema.dbname_from_path(self.file1)
26 |         self.dbname2 = Schema.dbname_from_path(self.file2)
27 |         self.db = Database([self.file1,self.file2], verbose=False)
28 | 
29 |         self.db.create_feature(self.feat, default_value=None, target_db=self.dbname1)
30 |         self.db.set_values(self.feat, self.val1, self.hashes)
31 | 
32 |         self.db.create_feature(self.feat, default_value=None, target_db=self.dbname2)
33 |         self.db.set_values(self.feat, self.val1, self.hashes[:1], target_db=self.dbname2)
34 |         self.db.set_values(self.feat, self.val2, self.hashes, target_db=self.dbname2)
35 | 
36 |         self.db.create_feature(self.feat2, default_value=None, target_db=self.dbname2)
37 |         self.db.set_values(self.feat2, self.val2, self.hashes)
38 | 
39 |         self.db.create_feature(self.feat3, default_value=0, target_db=self.dbname1)
40 |         self.db.set_values(self.feat3, 1, self.hashes[0])
41 |         self.db.set_values(self.feat3, 10, self.hashes[1])
42 |         self.db.set_values(self.feat3, 100, self.hashes[2])
43 | 
44 |         return super().setUp()
45 | 
46 |     def tearDown(self) -> None:
47 |         if os.path.exists(self.file1):
48 |             os.remove(self.file1)
49 |         if os.path.exists(self.file2):
50 |             os.remove(self.file2)
51 |         return super().tearDown()
52 |     
53 |     def simple_query(self, feat, val, dbname=None):
54 |         if dbname is None:
55 |             return self.query("{}={}".format(feat, val))
56 |         else:
57 |             return self.query("{}:{}={}".format(dbname, feat, val))
58 |     
59 |     def query(self, query):
60 |         q = GBDQuery(self.db, query).build_query()
61 |         return [ hash for (hash, ) in self.db.query(q) ]
62 | 
63 |     # def dump(self):
64 |     #     import sqlite3
65 |     #     conn = sqlite3.connect(self.file)
66 |     #     for line in conn.iterdump():
67 |     #         print(line)
68 |     #     conn.close()
69 | 
70 | 
71 |     def test_feature_precedence_rules(self):
72 |         res = self.simple_query(self.feat, self.val1)
73 |         self.assertEqual(len(res), 3)
74 |         res = self.simple_query(self.feat, self.val2)
75 |         self.assertEqual(len(res), 0)
76 |         res = self.simple_query(self.feat, self.val1, self.dbname1)
77 |         self.assertEqual(len(res), 3)
78 |         res = self.simple_query(self.feat, self.val2, self.dbname2)
79 |         self.assertEqual(len(res), 3)
80 | 
81 |     def test_string_inequality(self):
82 |         res = self.query("{} < {}".format(self.feat, self.val2))
83 |         self.assertEqual(len(res), 3)
84 |         res = self.query("{} > {}".format(self.feat, self.val1))
85 |         self.assertEqual(len(res), 0)
86 | 
87 |     def test_numeric_inequality(self):
88 |         res = self.query("{} < 2".format(self.feat3))
89 |         self.assertEqual(len(res), 1)
90 |         
91 |     def test_multivalued_subselect(self):
92 |         res = self.query("{db}:{f} != {v1} and {db}:{f} = {v2}".format(f=self.feat, v1=self.val1, v2=self.val2, db=self.dbname2))
93 |         self.assertEqual(len(res), 2)
94 | 
95 |     def test_feature_accessible(self):
96 |         res = self.simple_query(self.feat2, self.val2)
97 |         self.assertEqual(len(res), 3)


--------------------------------------------------------------------------------
/tests/test_db_unique_features.py:
--------------------------------------------------------------------------------
 1 | ##!/usr/bin/python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import unittest
 6 | import sqlite3
 7 | 
 8 | from gbd_core.query import GBDQuery
 9 | from gbd_core.database import Database, DatabaseException
10 | from gbd_core.schema import Schema
11 | 
12 | from tests import util
13 | 
14 | class DatabaseTestCase(unittest.TestCase):
15 | 
16 |     feat = "unique_feature"
17 |     val1 = "value1"
18 |     val2 = "value2"
19 |     defv = "empty"
20 | 
21 |     def setUp(self) -> None:
22 |         self.file = util.get_random_unique_filename('test', '.db')
23 |         sqlite3.connect(self.file).close()
24 |         self.name = Schema.dbname_from_path(self.file)
25 |         self.db = Database([self.file], verbose=False)
26 |         self.db.create_feature(self.feat, default_value=self.defv)
27 |         self.db.set_values(self.feat, self.val1, ["a", "b", "c"])
28 |         return super().setUp()
29 | 
30 |     def tearDown(self) -> None:
31 |         if os.path.exists(self.file):
32 |             os.remove(self.file)
33 |         return super().tearDown()
34 | 
35 |     def query(self, feat, val):
36 |         qb = GBDQuery(self.db, "{}={}".format(feat, val))
37 |         q = qb.build_query()
38 |         return [ hash for (hash, ) in self.db.query(q) ]
39 | 
40 |     def dump(self):
41 |         import sqlite3
42 |         conn = sqlite3.connect(self.file)
43 |         for line in conn.iterdump():
44 |             print(line)
45 |         conn.close()
46 | 
47 |     # Test that the feature values are initialized correctly in test setup
48 |     def test_unique_feature_values_exist(self):
49 |         res = self.query(self.feat, self.val1)
50 |         self.assertEqual(len(res), 3)
51 |         self.assertSetEqual(set(res), set(["a", "b", "c"]))
52 | 
53 |     # Overwrite one value and check if it is set correctly and that the other values are still there
54 |     def test_unique_feature_values_overwrite(self):
55 |         self.db.set_values(self.feat, self.val2, ["a"])
56 |         res = self.query(self.feat, self.val1)
57 |         self.assertEqual(len(res), 2)
58 |         self.assertSetEqual(set(res), set(["b", "c"]))
59 |         res2 = self.query(self.feat, self.val2)
60 |         self.assertEqual(len(res2), 1)
61 |         self.assertSetEqual(set(res2), set(["a"]))
62 | 
63 |     # Delete specific hash-value pair and check if it is deleted (=set to default value) and that the other values are still there
64 |     def test_unique_feature_values_delete_hash_value(self):
65 |         self.db.delete(self.feat, [ self.val1 ], ["a"])
66 |         res = self.query(self.feat, self.val1)
67 |         self.assertEqual(len(res), 2)
68 |         self.assertSetEqual(set(res), set(["b", "c"]))
69 |         res = self.query(self.feat, self.defv)
70 |         self.assertEqual(len(res), 1)
71 |         self.assertSetEqual(set(res), set(["a"]))
72 | 
73 |     # Delete specific hash and check if it is deleted (=set to default value) and that the other values are still there
74 |     def test_unique_feature_values_delete_hash(self):
75 |         self.db.delete(self.feat, [ ], ["a"])
76 |         res = self.query(self.feat, self.val1)
77 |         self.assertEqual(len(res), 2)
78 |         self.assertSetEqual(set(res), set(["b", "c"]))
79 |         res = self.query(self.feat, self.defv)
80 |         self.assertEqual(len(res), 1)
81 |         self.assertSetEqual(set(res), set(["a"]))
82 | 
83 |     # Delete specific value and check if it is deleted (=set to default value) and that the other values are still there
84 |     def test_unique_feature_values_delete_value(self):
85 |         self.db.delete(self.feat, [ self.val1 ], [ ])
86 |         res = self.query(self.feat, self.val1)
87 |         self.assertEqual(len(res), 0)
88 |         res = self.query(self.feat, self.defv)
89 |         self.assertEqual(len(res), 3)
90 |         self.assertSetEqual(set(res), set([ "a", "b", "c" ]))
91 | 
92 |     # Delete feature
93 |     def test_unique_feature_delete(self):
94 |         self.db.delete_feature(self.feat)
95 |         self.assertRaises(DatabaseException, self.db.find, self.feat)
96 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Global Benchmark Database (GBD)
 2 | 
 3 | [![DOI](https://zenodo.org/badge/141396410.svg)](https://doi.org/10.5281/zenodo.17820182)
 4 | 
 5 | GBD is a comprehensive suite of tools for provisioning and sustainably maintaining benchmark instances and their metadata for empirical research on hard algorithmic problem classes.
 6 | For an introduction to the GBD concept, the underlying data model, and specific use cases, please refer to our [2024 SAT Tool Paper](https://doi.org/10.4230/LIPIcs.SAT.2024.18).
 7 | 
 8 | ## GBD 5.0 Release Notes
 9 | 
10 | In addition to several bug fixes and performance improvements, GBD 5.0 no longer depends on Pandas for its interface module.
11 | This simplifies installation and use in various environments.
12 | The faster, more lightweight Polars library is now used for dataframes instead.
13 | Therefore, upgrading to GBD 5.0 requires existing code to be adapted to use Polars dataframes, or Polars dataframes to be explicitly converted to Pandas dataframes (e.g. via df.to_pandas()).
14 | 
15 | ## GBD contributes data to your algorithmic evaluations
16 | 
17 | GBD provides benchmark instance identifiers, feature extractors, and instance transformers for hard algorithmic problem domains, now including propositional satisfiability (SAT) and optimization (MaxSAT), and pseudo-Boolean optimization (PBO).
18 | 
19 | ## GBD solves several problems
20 | 
21 | - benchmark instance identification
22 | - identification of equivalence classes of benchmark instances
23 | - distribution of benchmark instances and benchmark metadata
24 | - initialization and maintenance of instance feature databases
25 | - transformation algorithms for benchmark instances
26 | 
27 | GBD provides an extensible set of problem domains, feature extractors, and instance transformers.
28 | For a description of those currently supported, see the [GBDC documentation](https://udopia.github.io/gbdc/doc/Index.html).
29 | GBDC is a Python extension module for GBD's performance-critical code (written in C++), maintained in a separate [repository](https://github.com/Udopia/gbdc).
30 | 
31 | ## Installation and Configuration
32 | 
33 | - Run `pip install gbd-tools`
34 | - Run `pip install gbdc` (optional, installation of extension module gbdc)
35 | - Obtain a GBD database, e.g. download [https://benchmark-database.de/getdatabase/meta.db](https://benchmark-database.de/getdatabase/meta.db).
36 | - Configure your environment by registering paths to databases like this `export GBD_DB=path/to/database1:path/to/database2`.
37 | - Test the command line interface with the `gbd info` and `gbd --help` commands.
38 | 
39 | ## GBD Interfaces
40 | 
41 | GBD provides the command-line tool `gbd`, the web interface `gbd serve`, and the Python interface `gbd_core.api.GBD`.
42 | 
43 | ### GBD Command-Line Interface
44 | 
45 | Central commands in gbd are those for data access `gbd get` and database initialization `gbd init`.
46 | See `gbd --help` for more commands.
47 | Once a database is registered in the environment variable `GBD_DB`, the `gbd get` command can be used to access data.
48 | See `gbd get --help` for more information.
49 | `gbd init` provides access to registered feature extractors, such as those provided by the `gdbc` extension module.
50 | All initialization routines can be run in parallel, and resource limits can be set per process.
51 | See `gbd init --help` for more information.
52 | 
53 | ### GBD Server
54 | 
55 | The GBD server can be started locally with gbd serve. Our instance of the GBD server is hosted at [https://benchmark-database.de/](https://benchmark-database.de/).
56 | You can download benchmark instances and prebuilt feature databases from there.
57 | 
58 | ### GBD Python Interface
59 | 
60 | The GBD Python interface is used by all programs in the GBD ecosystem. Important here is the query command, which returns GBD data in the form of a Pandas dataframe for further analysis, as shown in the following example.
61 | 
62 | ```Python
63 | from gbd_core.api import GBD
64 | with GBD(['path/to/database1', 'path/to/database2', ..] as gbd:
65 |     df = gbd.query("family = hardware-bmc", resolve=['verified-result', 'runtime-kissat'])
66 | ```
67 | 
68 | Scripts and use cases of GBD's Python interface are available on [https://udopia.github.io/gbdeval/](https://udopia.github.io/gbdeval/).
69 | The [evaluation demo](https://udopia.github.io/gbdeval/demo_evaluation.html) demonstrates portfolio analysis and subsequent category-wise performance evaluation using the 2023 SAT competition data.
70 | The [prediction demo](https://udopia.github.io/gbdeval/demo_prediction.html) demonstrates category prediction from instance features and subsequent feature importance evaluation.
71 | 
72 | 


--------------------------------------------------------------------------------
/gbd_core/util.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | import sys
 16 | import os
 17 | 
 18 | 
 19 | # Thanks to Boris V. for this code https://stackoverflow.com/questions/4675728/redirect-stdout-to-a-file-in-python
 20 | from contextlib import contextmanager
 21 | 
 22 | 
 23 | def fileno(file_or_fd):
 24 |     fd = getattr(file_or_fd, "fileno", lambda: file_or_fd)()
 25 |     if not isinstance(fd, int):
 26 |         raise ValueError("Expected a file (`.fileno()`) or a file descriptor")
 27 |     return fd
 28 | 
 29 | 
 30 | @contextmanager
 31 | def stdout_redirected(to=os.devnull, stdout=None):
 32 |     if stdout is None:
 33 |         stdout = sys.stdout
 34 | 
 35 |     stdout_fd = fileno(stdout)
 36 |     # copy stdout_fd before it is overwritten
 37 |     # NOTE: `copied` is inheritable on Windows when duplicating a standard stream
 38 |     with os.fdopen(os.dup(stdout_fd), "wb") as copied:
 39 |         stdout.flush()  # flush library buffers that dup2 knows nothing about
 40 |         try:
 41 |             os.dup2(fileno(to), stdout_fd)  # $ exec >&to
 42 |         except ValueError:  # filename
 43 |             with open(to, "wb") as to_file:
 44 |                 os.dup2(to_file.fileno(), stdout_fd)  # $ exec > to
 45 |         try:
 46 |             yield stdout  # allow code to be run with the redirected stdout
 47 |         finally:
 48 |             # restore stdout to its previous value
 49 |             # NOTE: dup2 makes stdout_fd inheritable unconditionally
 50 |             stdout.flush()
 51 |             os.dup2(copied.fileno(), stdout_fd)  # $ exec >&copied
 52 | 
 53 | 
 54 | def slice_iterator(data, slice_len):
 55 |     it = iter(data)
 56 |     while True:
 57 |         items = []
 58 |         for index in range(slice_len):
 59 |             try:
 60 |                 item = next(it)
 61 |             except StopIteration:
 62 |                 if items == []:
 63 |                     return  # we are done
 64 |                 else:
 65 |                     break  # exits the "for" loop
 66 |             items.append(item)
 67 |         yield items
 68 | 
 69 | 
 70 | def is_number(s):
 71 |     try:
 72 |         if s is not None:
 73 |             float(s)
 74 |             return True
 75 |     except ValueError:
 76 |         return False
 77 |     return False
 78 | 
 79 | 
 80 | def eprint(*args, **kwargs):
 81 |     print(*args, file=sys.stderr, **kwargs)
 82 | 
 83 | 
 84 | def read_hashes():
 85 |     eprint("Reading hashes from stdin ...")
 86 |     hashes = list()
 87 |     try:
 88 |         while True:
 89 |             line = sys.stdin.readline().split()
 90 |             if len(line) == 0:
 91 |                 return hashes
 92 |             hashes.extend(line)
 93 |     except KeyboardInterrupt:
 94 |         return hashes
 95 |     return hashes
 96 | 
 97 | 
 98 | def confirm(prompt="Confirm", resp=False):
 99 |     """
100 |     prompts for yes or no response from the user. Returns True for yes and False for no.
101 |     'resp' should be set to the default value assumed by the caller when user simply types ENTER.
102 |     """
103 |     if resp:
104 |         prompt = "%s [%s]|%s: " % (prompt, "y", "n")
105 |     else:
106 |         prompt = "%s [%s]|%s: " % (prompt, "n", "y")
107 | 
108 |     while True:
109 |         ans = "z"
110 |         try:
111 |             ans = input(prompt)
112 |         except EOFError:
113 |             # This hack is for OSX and Linux only
114 |             # There EOFError occurs when hashes were read from stdin before
115 |             # Reopening stdin in order to facilitate subsequent user input:
116 |             sys.stdin = open("/dev/tty", mode="r")
117 |             ans = input()
118 |         if not ans:
119 |             return resp
120 |         if ans not in ["y", "Y", "n", "N"]:
121 |             print("please enter y or n.")
122 |             continue
123 |         if ans == "y" or ans == "Y":
124 |             return True
125 |         if ans == "n" or ans == "N":
126 |             return False
127 | 


--------------------------------------------------------------------------------
/gbd_core/query.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | 
 16 | from gbd_core.database import Database, DatabaseException
 17 | from gbd_core.grammar import Parser
 18 | from gbd_core import contexts
 19 | from gbd_core.schema import Schema
 20 | 
 21 | 
 22 | class GBDQuery:
 23 |     def __init__(self, db: Database, query):
 24 |         self.db = db
 25 |         self.parser = Parser(query)
 26 |         self.features = self.parser.get_features()
 27 | 
 28 |     def features_exist_or_throw(self, features):
 29 |         for feature in features:
 30 |             self.db.find(feature)
 31 | 
 32 |     # Generate SQL Query from given GBD Query
 33 |     def build_query(self, hashes=[], resolve=[], group_by=None, join_type="LEFT", collapse=None):
 34 |         group = group_by or self.determine_group_by(resolve)
 35 | 
 36 |         self.features_exist_or_throw(resolve + [group] + list(self.features))
 37 | 
 38 |         sql_select = self.build_select(group, resolve, collapse)
 39 | 
 40 |         sql_from = self.build_from(group, set(resolve) | self.features, join_type)
 41 | 
 42 |         sql_where = self.build_where(hashes, group)
 43 | 
 44 |         sql_groupby = "GROUP BY {}".format(self.db.faddr(group)) if collapse else ""
 45 |         sql_orderby = "ORDER BY {}".format(self.db.faddr(group))
 46 | 
 47 |         return "{} {} WHERE {} {} {}".format(sql_select, sql_from, sql_where, sql_groupby, sql_orderby)
 48 | 
 49 |     def determine_group_by(self, resolve):
 50 |         if len(resolve) == 0:
 51 |             return self.db.dcontext(self.db.find("hash").database) + ":hash"
 52 |         else:
 53 |             return self.db.dcontext(self.db.find(resolve[0]).database) + ":hash"
 54 | 
 55 |     def build_select(self, group_by, resolve, collapse=None):
 56 |         result = [self.db.faddr(f) for f in [group_by] + resolve]
 57 |         if collapse and collapse != "none":
 58 |             result = ["{}(DISTINCT {})".format(collapse, r) for r in result]
 59 |         return "SELECT DISTINCT " + ", ".join(result)
 60 | 
 61 |     def find_translator_feature(self, source_context, target_context):
 62 |         for dbname in self.db.get_databases(source_context):
 63 |             # eprint("Checking database {} for translator".format(dbname))
 64 |             if "to_" + target_context in self.db.get_features([dbname]):
 65 |                 return self.db.find("to_" + target_context, dbname)
 66 | 
 67 |         for dbname in self.db.get_databases(target_context):
 68 |             # eprint("Checking database {} for translator".format(dbname))
 69 |             if "to_" + source_context in self.db.get_features([dbname]):
 70 |                 return self.db.find("to_" + source_context, dbname)
 71 | 
 72 |         raise DatabaseException("No translator feature found for contexts {} and {}".format(source_context, target_context))
 73 | 
 74 |     def build_from(self, group, features, join_type="LEFT"):
 75 |         result = dict()
 76 | 
 77 |         gdatabase = self.db.find(group).database
 78 |         gtable = self.db.find(group).table
 79 |         gcontext = self.db.dcontext(gdatabase)
 80 |         gaddress = gdatabase + "." + gtable
 81 |         result[gaddress] = "FROM {}".format(gaddress)
 82 | 
 83 |         tables = set([(finfo.database, finfo.table) for finfo in [self.db.find(f) for f in features]])
 84 |         for fdatabase, ftable in tables:
 85 |             faddress = fdatabase + "." + ftable
 86 |             ffeatures_address = fdatabase + ".features"
 87 |             if not faddress in result:  # join only once
 88 |                 fcontext = self.db.dcontext(fdatabase)
 89 |                 if fcontext == gcontext:
 90 |                     if faddress == ffeatures_address:  # join features table directly
 91 |                         result[faddress] = "{j} JOIN {t} ON {t}.hash = {g}.hash".format(j=join_type, t=ffeatures_address, g=gaddress)
 92 |                     else:  # join non-unique features table via features table
 93 |                         fname = ftable
 94 |                         if not ffeatures_address in result:
 95 |                             result[ffeatures_address] = "{j} JOIN {t} ON {t}.hash = {g}.hash".format(j=join_type, t=ffeatures_address, g=gaddress)
 96 |                         result[faddress] = "{j} JOIN {t} ON {t}.hash = {ft}.{n}".format(j=join_type, t=faddress, ft=ffeatures_address, n=fname)
 97 |                 else:
 98 |                     tfeat = self.find_translator_feature(gcontext, fcontext)
 99 |                     direction = ("hash", "value") if self.db.dcontext(tfeat.database) == gcontext else ("value", "hash")
100 | 
101 |                     taddress = tfeat.database + "." + tfeat.table
102 |                     if not taddress in result:
103 |                         result[taddress] = "INNER JOIN {trans} ON {group}.hash = {trans}.{dir0}".format(trans=taddress, group=gaddress, dir0=direction[0])
104 | 
105 |                     result[faddress] = "INNER JOIN {feat} ON {trans}.{dir1} = {feat}.hash".format(feat=faddress, trans=taddress, dir1=direction[1])
106 | 
107 |         return " ".join(result.values())
108 | 
109 |     def build_where(self, hashes, group_by):
110 |         group_column = self.db.faddr(group_by)
111 |         group_table = self.db.faddr_table(group_by)
112 |         result = group_column + " != 'None' AND " + self.parser.get_sql(self.db)
113 |         if len(hashes):
114 |             result = result + " AND {}.hash in ('{}')".format(group_table, "', '".join(hashes))
115 |         return result
116 | 


--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import unittest
  4 | import sqlite3
  5 | import polars as pl
  6 | 
  7 | from gbd_core.api import GBD, GBDException
  8 | from gbd_core.schema import Schema
  9 | 
 10 | from tests import util
 11 | 
 12 | class APITestCase(unittest.TestCase):
 13 | 
 14 |     def setUp(self) -> None:
 15 |         self.file1 = util.get_random_unique_filename('test1', '.db')
 16 |         self.file2 = util.get_random_unique_filename('test2', '.db')
 17 |         sqlite3.connect(self.file1).close()
 18 |         sqlite3.connect(self.file2).close()
 19 |         self.name1 = Schema.dbname_from_path(self.file1)
 20 |         self.name2 = Schema.dbname_from_path(self.file2)
 21 |         self.api = GBD([self.file1, self.file2])
 22 |         return super().setUp()
 23 | 
 24 |     def tearDown(self) -> None:
 25 |         if os.path.exists(self.file1):
 26 |             os.remove(self.file1)
 27 |         if os.path.exists(self.file2):
 28 |             os.remove(self.file2)
 29 |         return super().tearDown()
 30 | 
 31 |     def test_databases_exist(self):
 32 |         self.assertEquals(self.api.get_databases(), [ self.name1, self.name2 ])
 33 |         self.assertEquals(self.api.get_database_path(self.name1), self.file1)
 34 |         self.assertEquals(self.api.get_database_path(self.name2), self.file2)
 35 | 
 36 |     def test_create_feature(self):
 37 |         self.api.create_feature("A", None, self.name1)
 38 |         self.assertTrue(self.api.feature_exists("A"))
 39 |         self.api.create_feature("A", None, self.name2)
 40 |         api2 = GBD([self.file2])
 41 |         self.assertTrue(api2.feature_exists("A"))
 42 |         with self.assertRaises(GBDException):
 43 |             self.api.create_feature("A", None, self.name1)
 44 |         with self.assertRaises(GBDException):
 45 |             self.api.create_feature("A", None, self.name2)
 46 | 
 47 |     def test_delete_feature(self):
 48 |         self.api.create_feature("A", None, self.name1)
 49 |         self.api.create_feature("A", None, self.name2)
 50 |         self.api.delete_feature("A", self.name1)
 51 |         self.assertFalse(self.api.feature_exists("A", self.name1))
 52 |         self.assertTrue(self.api.feature_exists("A"))
 53 |         self.assertTrue(self.api.feature_exists("A", self.name2))
 54 |         self.api.delete_feature("A")
 55 |         self.assertFalse(self.api.feature_exists("A"))
 56 | 
 57 |     def test_rename_feature(self):
 58 |         self.api.create_feature("A", None, self.name1)
 59 |         self.api.create_feature("B", None, self.name1)
 60 |         self.api.create_feature("A", None, self.name2)
 61 |         self.api.rename_feature("A", "B", self.name2)
 62 |         self.assertFalse(self.api.feature_exists("A", self.name2))
 63 |         self.assertTrue(self.api.feature_exists("B", self.name2))
 64 |         self.assertTrue(self.api.feature_exists("A", self.name1))
 65 |         self.assertTrue(self.api.feature_exists("B", self.name1))
 66 |         with self.assertRaises(GBDException):
 67 |             self.api.rename_feature("A", "B", self.name1)
 68 | 
 69 |     def test_set_values(self):
 70 |         self.api.create_feature("A", None, self.name1) # feature is multi-valued
 71 |         self.api.create_feature("B", "empty", self.name1) # feature has default value
 72 |         self.api.create_feature("A", "empty", self.name2) # shadowed feature
 73 |         # value1 (set values, default values emerge)
 74 |         self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name1)
 75 |         df: pl.DataFrame = self.api.query("A = value1", resolve=["A", "B"])
 76 |         self.assertCountEqual(df['hash'].to_list(), [ str(i) for i in range(100) ])
 77 |         self.assertCountEqual(df['A'].to_list(), [ "value1" for _ in range(100) ])
 78 |         self.assertCountEqual(df['B'].to_list(), [ "empty" for _ in range(100) ])
 79 |         # value2 (set values, feature is multi-valued)
 80 |         self.api.set_values("A", "value2", [ str(i) for i in range(50) ], self.name1)
 81 |         df: pl.DataFrame = self.api.query("A = value1 or A = value2", resolve=["A"], collapse=None)
 82 |         self.assertCountEqual(df['A'].to_list(), [ "value2" for _ in range(50) ] + [ "value1" for _ in range(100) ])
 83 |         # value3 (set values of shadowed feature by specifying target-database)
 84 |         self.api.set_values("A", "value3", [ str(i) for i in range(50) ], self.name2)
 85 |         df: pl.DataFrame = self.api.query("A = value1 or A = value2", resolve=["A"], collapse=None)
 86 |         self.assertCountEqual(df['A'].to_list(), [ "value2" for _ in range(50) ] + [ "value1" for _ in range(100) ])
 87 |         self.api.database.commit()
 88 |         api2 = GBD([self.file2])
 89 |         df: pl.DataFrame = api2.query("A = value3", resolve=["A"])
 90 |         self.assertCountEqual(df["A"].to_list(), [ "value3" for _ in range(50) ])
 91 | 
 92 |     def test_reset_values(self):
 93 |         self.api.create_feature("A", None, self.name1)
 94 |         self.api.create_feature("B", "empty", self.name1)
 95 |         self.api.create_feature("A", "empty", self.name2)
 96 |         self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name1)
 97 |         self.api.set_values("A", "value2", [ str(i) for i in range(100) ], self.name1)
 98 |         self.api.set_values("B", "value3", [ str(i) for i in range(100) ], self.name1)
 99 |         self.api.set_values("A", "value1", [ str(i) for i in range(100) ], self.name2)
100 |         # reset values in A
101 |         self.api.reset_values("A", [ "value1" ], [ str(i) for i in range(50) ], self.name1)
102 |         df: pl.DataFrame = self.api.query(None, hashes=[ str(i) for i in range(100) ], resolve=["A"], collapse=None)
103 |         self.assertCountEqual(df['A'].to_list(), [ "value1" for _ in range(50) ] + [ "value2" for _ in range(100) ])
104 |         # reset values in B
105 |         self.api.reset_values("B", [ "value3" ], [ str(i) for i in range(50) ], self.name1)
106 |         df: pl.DataFrame = self.api.query(None, hashes=[ str(i) for i in range(100) ], resolve=["B"])
107 |         self.assertCountEqual(df['B'].to_list(), [ "value3" for _ in range(50) ] + [ "empty" for _ in range(50) ])
108 |         # reset values in shadowed A
109 |         self.api.database.verbose = True
110 |         self.api.reset_values("A", [ "value1" ], [ str(i) for i in range(50) ], self.name2)
111 |         self.api.database.commit()
112 |         api2 = GBD([self.file2])
113 |         df: pl.DataFrame = api2.query("A = value1", resolve=["A"])
114 |         self.assertCountEqual(df["A"].to_list(), [ "value1" for _ in range(50) ])


--------------------------------------------------------------------------------
/gbd_init/instance_transformers.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | 
 16 | import os
 17 | import polars as pl
 18 | from functools import reduce
 19 | 
 20 | from gbd_core import contexts
 21 | from gbd_core.api import GBD, GBDException
 22 | from gbd_core import util
 23 | 
 24 | from gbd_core.contexts import identify
 25 | from gbd_init.initializer import Initializer, InitializerException
 26 | 
 27 | try:
 28 |     from gbdc import cnf2kis, sanitise, normalise
 29 | except ImportError:
 30 | 
 31 |     def cnf2kis(ipath, opath):
 32 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 33 | 
 34 |     def sanitise(ipath, opath):
 35 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 36 | 
 37 |     def normalise(ipath, opath):
 38 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 39 | 
 40 | 
 41 | def kis_filename(path):
 42 |     kispath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path)
 43 |     return kispath + ".kis"
 44 | 
 45 | 
 46 | def sanitised_filename(path):
 47 |     sanpath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path)
 48 |     return sanpath + ".sanitized.cnf"
 49 | 
 50 | 
 51 | def normalised_filename(path):
 52 |     normpath = reduce(lambda path, suffix: path[: -len(suffix)] if path.endswith(suffix) else path, contexts.suffixes("cnf"), path)
 53 |     return normpath + ".normalised.cnf"
 54 | 
 55 | 
 56 | def wrap_cnf2kis(hash, path, limits):
 57 |     kispath = kis_filename(path)
 58 |     util.eprint("Transforming {} to k-ISP {}".format(path, kispath))
 59 |     try:
 60 |         result = cnf2kis(path, kispath)
 61 |         if "local" in result:
 62 |             kishash = result["hash"]
 63 |             return [
 64 |                 ("local", kishash, result["local"]),
 65 |                 ("to_cnf", kishash, hash),
 66 |                 ("nodes", kishash, result["nodes"]),
 67 |                 ("edges", kishash, result["edges"]),
 68 |                 ("k", kishash, result["k"]),
 69 |             ]
 70 |         else:
 71 |             raise GBDException("CNF2KIS failed for {} due to {}".format(path, result["hash"]))
 72 |     except Exception as e:
 73 |         util.eprint(str(e))
 74 |         if os.path.exists(kispath):
 75 |             os.remove(kispath)
 76 | 
 77 |     return []
 78 | 
 79 | 
 80 | def wrap_sanitise(hash, path, limits):
 81 |     sanpath = sanitised_filename(path)
 82 |     util.eprint("Sanitising {}".format(path))
 83 |     try:
 84 |         with open(sanpath, "w") as f, util.stdout_redirected(f):
 85 |             result = sanitise(path, sanpath)
 86 |             if "local" in result:
 87 |                 sanhash = result["hash"]
 88 |                 return [("local", sanhash, result["local"]), ("to_cnf", sanhash, hash)]
 89 |             else:
 90 |                 raise GBDException("Sanitization failed for {}".format(path))
 91 |     except Exception as e:
 92 |         util.eprint(str(e))
 93 |         if os.path.exists(sanpath):
 94 |             os.remove(sanpath)
 95 | 
 96 |     return []
 97 | 
 98 | 
 99 | def wrap_normalise(hash, path, limits):
100 |     normpath = normalised_filename(path)
101 |     util.eprint("Normalising {}".format(path))
102 |     try:
103 |         with open(normpath, "w") as f, util.stdout_redirected(f):
104 |             result = normalise(path, normpath)
105 |             normhash = result["hash"]
106 |             if "local" in result and hash == normhash:
107 |                 return [("local", normhash, result["local"])]
108 |             else:
109 |                 raise GBDException("Normalisation failed for {}".format(path))
110 |     except Exception as e:
111 |         util.eprint(str(e))
112 |         if os.path.exists(normpath):
113 |             os.remove(normpath)
114 | 
115 |     return []
116 | 
117 | 
118 | def transform_instances_generic(key: str, api: GBD, rlimits, query, hashes, target_db, source, collapse=None):
119 |     einfo = generic_transformers[key]
120 |     context = api.database.dcontext(target_db)
121 |     if not context in einfo["target"]:
122 |         raise InitializerException("Target database context must be in {}".format(einfo["target"]))
123 |     if not source in einfo["source"]:
124 |         raise InitializerException("Source database context must be in {}".format(einfo["source"]))
125 |     transformer = Initializer(api, rlimits, target_db, einfo["features"], einfo["compute"])
126 |     transformer.create_features()
127 |     
128 |     def path_exists(p):
129 |         return p is not None and os.path.exists(einfo["filename"](p))
130 | 
131 |     df: pl.DataFrame = api.query(query, hashes, [source + ":local"], collapse=collapse)
132 |     missing = df.with_columns(
133 |         exists=pl.col("local").map_elements(
134 |             path_exists,
135 |             return_dtype=pl.Boolean
136 |         )
137 |     ).filter(~pl.col("exists"))
138 | 
139 |     transformer.run(missing)
140 | 
141 | 
142 | generic_transformers = {
143 |     "sanitise": {
144 |         "description": "Sanitise CNF files. ",
145 |         "source": ["cnf"],
146 |         "target": ["sancnf"],
147 |         "features": [("local", None), ("to_cnf", None)],
148 |         "compute": wrap_sanitise,
149 |         "filename": sanitised_filename,
150 |     },
151 |     "normalise": {
152 |         "description": "Normalise CNF files. ",
153 |         "source": ["cnf"],
154 |         "target": ["cnf"],
155 |         "features": [("local", None)],
156 |         "compute": wrap_normalise,
157 |         "filename": normalised_filename,
158 |     },
159 |     "cnf2kis": {
160 |         "description": "Transform CNF files to k-ISP instances. ",
161 |         "source": ["cnf"],
162 |         "target": ["kis"],
163 |         "features": [("local", None), ("to_cnf", None), ("nodes", "empty"), ("edges", "empty"), ("k", "empty")],
164 |         "compute": wrap_cnf2kis,
165 |         "filename": kis_filename,
166 |     },
167 | }
168 | 


--------------------------------------------------------------------------------
/gbd_server/templates/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html lang="en">
  4 | <head>
  5 |     <title>Global Benchmark Database</title>
  6 |     <link rel="stylesheet" href="{{ url_for('static', filename='main.css') }}" />
  7 |     <!--<script src="{{ url_for('static', filename='jquery-3.6.0.min.js') }}"></script>
  8 |     <script src="{{ url_for('static', filename='sorttable.js') }}"></script>
  9 |     <script src="{{ url_for('static', filename='w3.js') }}"></script>-->
 10 | </head>
 11 | 
 12 | <body>
 13 | <div class="main">
 14 |     <header>
 15 |         <img center src="{{ url_for('static', filename='img/gbd_logo_small.png') }}" alt="GBD-Logo">
 16 |         <div class="form">
 17 |             <form method="post" id="contextform">
 18 |             <fieldset style="float:left">
 19 |                 <legend>Select a context</legend>
 20 |                 <input type="hidden" id="selected_db" name="selected_db" value="">
 21 |                 <input type="hidden" id="query" name="query" value="">
 22 |                 {% for c in contexts %}
 23 |                 <button type="submit" id="context" name="context" value="{{ c }}" formaction="{{ url_for('quick_search') }}" title="Select problem domain">
 24 |                     {% if c == context %}
 25 |                     <span style="color:var(--link-color-1)">{{ c }}</span>
 26 |                     {% else %}
 27 |                     {{ c }}
 28 |                     {% endif %}
 29 |                 </button>
 30 |                 {% endfor %}
 31 |             </fieldset>
 32 |             </form>
 33 |             <form method="post" id="dbform">
 34 |             <fieldset>
 35 |                 <legend>Select a context-specific database</legend>
 36 |                 <input type="hidden" id="context" name="context" value="{{ context }}">
 37 |                 <input type="hidden" id="query" name="query" value="{{ query }}">
 38 |                 {% for dbname in databases %}
 39 |                 <button type="submit" id="selected_db" name="selected_db" value="{{ dbname }}" formaction="{{ url_for('quick_search') }}" title="Select database to display">
 40 |                     {% if dbname == selected %}
 41 |                     <span style="color:var(--link-color-1)">{{ dbname }}</span>
 42 |                     {% else %}
 43 |                     {{ dbname }}
 44 |                     {% endif %}
 45 |                 </button>
 46 |                 {% endfor %}
 47 |             </fieldset>
 48 |             </form>
 49 |             <br />
 50 |             <form method="post" id="mainform">
 51 |                 <fieldset>
 52 |                     <legend>Query for Instances</legend>
 53 |                     <input type="hidden" id="context" name="context" value="{{ context }}">
 54 |                     <input type="hidden" id="selected_db" name="selected_db" value="{{ selected }}">
 55 |                     <input type="text" id="query" name="query" class="query" placeholder="Query for Instances" value="{{ query }}">
 56 |                     <button type="submit" class="submit" formaction="{{ url_for('quick_search') }}" form="mainform" id="queryaction" value="show" title="Query for instances and selected features">Show</button>
 57 |                 </fieldset>
 58 |                 <br />
 59 |                 <div class="info">
 60 |                     {% if error is not none %}
 61 |                     <span style="color:var(--link-color-1)">{{ error }}</span>
 62 |                     {% endif %} 
 63 |                 </div>
 64 |             </form>
 65 |         </div>
 66 |         <div class="help">
 67 |             <fieldset>
 68 |                 <legend>Quickstart</legend>
 69 |                 <ul>
 70 |                     <li><b>Query for Instances:</b>
 71 |                         Simple queries are constraints of the form "feature operator value", e.g., <a href="{{ url_for('quick_search', track='main_2024', context='cnf') }}">track=main_2024</a>. More complex queries can be formulated as specified in <a href="https://doi.org/10.4230/LIPIcs.SAT.2024.18">our SAT 2024 paper</a>.
 72 |                     </li>
 73 |                     <li><b>Download Instances:</b>
 74 |                         The file <a href="{{ url_for('get_url_file', query=query, context=context) }}">{{ query_name }}.uri</a> contains the download links for the selected instances. Use it to download the instances, e.g., with wget like this<br /> <span style="font-family: monospace;">wget --content-disposition -i {{ query_name }}.uri</span>.
 75 |                     </li>
 76 |                     <li><b>Download the selected database:</b>
 77 |                         <a href="{{ url_for('get_database_file', database=selected) }}" title="Download Database">{{ selected }}</a>
 78 |                     </li>
 79 |                 </ul>
 80 |             </fieldset>
 81 |         </div>
 82 |     </header>
 83 |     
 84 |     <div class="content">
 85 |         <table>
 86 |             <tr>
 87 |                 <th>hash</th>
 88 |                 {% for feature in features %}
 89 |                 <th>{{ feature }}</th>
 90 |                 {% endfor %}
 91 |             </tr>
 92 |             
 93 |             {% for row in result %}
 94 |             <tr>
 95 |                 {% for item in row %}
 96 |                     {% if loop.index == 1 %}
 97 |                         <td><a href="{{ url_for('get_file', hashvalue=item, context=context) }}">{{ item }}</a></td>
 98 |                     {% elif item is link_field %}
 99 |                         <td><a href="{{ item }}">{{ item }}</a></td>
100 |                     {% elif item is int_field %}
101 |                         <td style="text-align: right;">{{ item }}</td>
102 |                     {% elif item is num_field %}
103 |                         <td style="text-align: right;">{{ '%0.2f'| format(item|float) }}</td>
104 |                     {% else %}
105 |                         <td>{{ item }}</td>
106 |                     {% endif %}
107 |                 {% endfor %}
108 |             </tr>
109 |             {% endfor %}
110 |         </table>
111 |     </div>
112 | 
113 |     <footer>
114 |         <div style="display: inline-block; vertical-align: center">
115 |             
116 |             {% if pages > 0 %}
117 |                 Found {{ total }} Instances. Select page: 
118 |                 {% for i in range(0, pages) %}
119 |                     {% if i < 2 or i > pages - 3 or (i > page - 3 and i < page + 3) %}
120 |                         {% if i == page %}
121 |                         <button type="submit" formaction="/" form="mainform" name="page" value="{{ i }}" style="color:var(--link-color-1);">{{ i + 1 }}</button>
122 |                         {% else %}
123 |                         <button type="submit" formaction="/" form="mainform" name="page" value="{{ i }}">{{ i + 1 }}</button>
124 |                         {% endif %}
125 |                     {% elif i == 3 or i == pages - 3 %}
126 |                         ...
127 |                     {% endif %}
128 |                 {% endfor %}
129 |             {% else %}
130 |                 Found {{ total }} instances.
131 |             {% endif %}
132 |         </div>
133 |         <div style="float: right;">
134 |             If you find this useful, please cite <a href="https://doi.org/10.4230/LIPIcs.SAT.2024.18">Global Benchmark Database (Iser &amp; Jabs, SAT 2024)</a>
135 |         </div>
136 |     </footer>
137 | </div>
138 | </body>
139 | </html>
140 | 


--------------------------------------------------------------------------------
/gbd_core/grammar.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | import tatsu
 16 | import json
 17 | 
 18 | from gbd_core.database import Database, DatabaseException
 19 | 
 20 | 
 21 | class ParserException(Exception):
 22 |     pass
 23 | 
 24 | 
 25 | class Parser:
 26 |     GRAMMAR = r"""
 27 |         @@grammar::GBDQuery
 28 |         @@ignorecase::True
 29 | 
 30 |         start 
 31 |             = 
 32 |             q:query $ 
 33 |             ;
 34 | 
 35 |         query 
 36 |             = 
 37 |             | left:query qop:("and" | "or") ~ right:query 
 38 |             | qop:("not") ~ q:query
 39 |             | constraint 
 40 |             | "(" q:query ")" 
 41 |             ;
 42 | 
 43 |         constraint 
 44 |             = 
 45 |             | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) ter:termstart
 46 |             | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) num:number 
 47 |             | col:(dbname ":" column | column) cop:("=" | "!=" | "<=" | ">=" | "<" | ">" ) str:string 
 48 |             | col:(dbname ":" column | column) cop:("like" | "unlike") ~ lik:(["%"] string ["%"])
 49 |             ;
 50 | 
 51 |         termstart 
 52 |             = 
 53 |             ("(") t:term (")")
 54 |             ;
 55 | 
 56 |         term 
 57 |             = 
 58 |             | left:term top:("+" | "-" | "*" | "/") right:term
 59 |             | ("(") t:term (")")
 60 |             | constant:number
 61 |             | col:(dbname ":" column | column)
 62 |             ;
 63 | 
 64 |         string
 65 |             =
 66 |             | "'" @:singlequotedstring "'"
 67 |             | '"' @:doublequotedstring '"'
 68 |             | /[a-zA-Z0-9_\.\-\/\,\:\+\=\@]+/
 69 |             ;
 70 | 
 71 |         # number = /[-]?[0-9]+[.]?[0-9]*/ ;
 72 |         number = /[-]?[0-9]+(?:\.[0-9]+)?(?![A-Za-z0-9_])/ ;
 73 |         singlequotedstring = /[a-zA-Z0-9_\.\-\/\,\:\+\=\@\s"\*\\]+/ ;
 74 |         doublequotedstring = /[a-zA-Z0-9_\.\-\/\,\:\+\=\@\s'\*\\]+/ ;
 75 |         column = /[a-zA-Z][a-zA-Z0-9_]*/ ;
 76 |         dbname = /[a-zA-Z][a-zA-Z0-9_]*/ ;
 77 |     """
 78 | 
 79 |     model = tatsu.compile(GRAMMAR)
 80 | 
 81 |     def __init__(self, query, verbose=False):
 82 |         try:
 83 |             self.ast = Parser.model.parse(query) if query else dict()
 84 |             if verbose:
 85 |                 print("Parsed: " + query)
 86 |                 print(json.dumps(tatsu.util.asjson(self.ast), indent=2))
 87 |         except tatsu.exceptions.FailedParse as e:
 88 |             raise ParserException(f"Failed to parse query: {str(e)}") from e
 89 | 
 90 |     def get_features(self, ast=None):
 91 |         # import pprint
 92 |         # pp = pprint.PrettyPrinter(depth=6)
 93 |         # pp.pprint(ast)
 94 |         try:
 95 |             ast = ast if ast else self.ast
 96 |             if "q" in ast:
 97 |                 return self.get_features(ast["q"])
 98 |             elif "t" in ast:
 99 |                 return self.get_features(ast["t"])
100 |             elif "qop" in ast or "top" in ast:
101 |                 return self.get_features(ast["left"]) | self.get_features(ast["right"])
102 |             elif "cop" in ast and "ter" in ast:
103 |                 return {"".join(ast["col"])} | self.get_features(ast["ter"])
104 |             elif "col" in ast:
105 |                 return {"".join(ast["col"])}
106 |             else:
107 |                 return set()
108 |         except TypeError as e:
109 |             raise ParserException(f"Failed to parse query: {str(e)}") from e
110 | 
111 |     def get_sql(self, db: Database, ast=None):
112 |         try:
113 |             ast = ast if ast else self.ast
114 |             if "qop" in ast and ast["qop"] == "not":
115 |                 return "NOT (" + self.get_sql(db, ast["q"]) + ")"
116 |             if "q" in ast:
117 |                 return "(" + self.get_sql(db, ast["q"]) + ")"
118 |             if "t" in ast:
119 |                 return "(" + self.get_sql(db, ast["t"]) + ")"
120 |             if "qop" in ast or "top" in ast:  # query operator or term operator
121 |                 operator = ast["qop"] if ast["qop"] else ast["top"]
122 |                 left = self.get_sql(db, ast["left"])
123 |                 right = self.get_sql(db, ast["right"])
124 |                 return f"{left} {operator} {right}"
125 |             if "cop" in ast:  # constraint operator
126 |                 operator = "not like" if ast["cop"] == "unlike" else ast["cop"]
127 |                 feat = db.faddr("".join(ast["col"]))
128 |                 feat_is_1_n = db.find("".join(ast["col"])).default is None
129 |                 if "str" in ast:  # cop:("=" | "!=")
130 |                     if feat_is_1_n:
131 |                         table = db.faddr_table("".join(ast["col"]))
132 |                         setop = "IN" if ast["cop"] == "=" else "NOT IN"
133 |                         return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE {f} = '{s}')".format(o=setop, t=table, f=feat, s=ast["str"])
134 |                     return f"{feat} {operator} '{ast['str']}'"
135 |                 if "num" in ast:  # cop:("=" | "!=" | "<=" | ">=" | "<" | ">" )
136 |                     if feat_is_1_n:
137 |                         table = db.faddr_table("".join(ast["col"]))
138 |                         return "{t}.hash IN (SELECT {t}.hash FROM {t} WHERE CAST({f} AS FLOAT) {o} {s})".format(o=operator, t=table, f=feat, s=ast["num"])
139 |                     return f"CAST({feat} AS FLOAT) {operator} {ast['num']}"
140 |                 if "lik" in ast:  # cop:("like" | "unlike")
141 |                     if feat_is_1_n:
142 |                         table = db.faddr_table("".join(ast["col"]))
143 |                         setop = "IN" if ast["cop"] == "like" else "NOT IN"
144 |                         return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE {f} like '{s}')".format(
145 |                             o=setop, t=table, f=feat, s="".join([t for t in ast["lik"] if t])
146 |                         )
147 |                     return f"{feat} {operator} '{''.join([t for t in ast['lik'] if t])}'"
148 |                 if "ter" in ast:  # cop:("=" | "!=" | "<=" | ">=" | "<" | ">" )
149 |                     if feat_is_1_n and ast["cop"] == "!=":
150 |                         table = db.faddr_table("".join(ast["col"]))
151 |                         setop = "NOT IN" if ast["cop"] == "!=" else "IN"
152 |                         cop = "=" if ast["cop"] == "!=" else ast["cop"]
153 |                         return "{t}.hash {o} (SELECT {t}.hash FROM {t} WHERE CAST({f} AS FLOAT) {c} {s})".format(
154 |                             o=setop, c=cop, t=table, f=feat, s=self.get_sql(db, ast["ter"])
155 |                         )
156 |                     return f"CAST({feat} AS FLOAT) {operator} {self.get_sql(db, ast['ter'])}"
157 |                 raise ParserException("Missing right-hand side of constraint")
158 |             if "col" in ast:
159 |                 feature = db.faddr("".join(ast["col"]))
160 |                 return f"CAST({feature} AS FLOAT)"
161 |             if "constant" in ast:
162 |                 return ast["constant"]
163 |             return "1=1"
164 |         except TypeError as e:
165 |             raise ParserException(f"Failed to parse query: {str(e)}") from e
166 |         except DatabaseException as e:
167 |             raise ParserException(f"Failed to parse query: {str(e)}") from e
168 | 


--------------------------------------------------------------------------------
/gbd_init/feature_extractors.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | import os
 16 | import glob
 17 | import warnings
 18 | import polars as pl
 19 | 
 20 | from gbd_core.contexts import suffixes, identify, get_context_by_suffix
 21 | from gbd_core.api import GBD, GBDException
 22 | from gbd_core.util import eprint, confirm
 23 | from gbd_init.initializer import Initializer, InitializerException
 24 | 
 25 | try:
 26 |     from gbdc import (
 27 |         extract_base_features,
 28 |         base_feature_names,
 29 |         extract_gate_features,
 30 |         gate_feature_names,
 31 |         isohash,
 32 |         wcnfisohash,
 33 |         wcnf_base_feature_names,
 34 |         extract_wcnf_base_features,
 35 |         opb_base_feature_names,
 36 |         extract_opb_base_features,
 37 |         checksani,
 38 |         checksani_feature_names,
 39 |     )
 40 | except ImportError:
 41 | 
 42 |     def extract_base_features(path, tlim, mlim):
 43 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 44 | 
 45 |     def base_feature_names():
 46 |         return []
 47 | 
 48 |     def extract_gate_features(path, tlim, mlim):
 49 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 50 | 
 51 |     def gate_feature_names():
 52 |         return []
 53 | 
 54 |     def isohash(path):
 55 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 56 | 
 57 |     def extract_wcnf_base_features(path, tlim, mlim):
 58 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 59 | 
 60 |     def wcnf_base_feature_names():
 61 |         return []
 62 | 
 63 |     def extract_opb_base_features(path, tlim, mlim):
 64 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 65 | 
 66 |     def opb_base_feature_names():
 67 |         return []
 68 | 
 69 |     def checksani(path, tlim, mlim):
 70 |         raise ModuleNotFoundError("gbdc not found", name="gbdc")
 71 | 
 72 |     def checksani_feature_names():
 73 |         return []
 74 | 
 75 | 
 76 | ## GBDHash
 77 | def compute_hash(hash, path, limits):
 78 |     eprint("Hashing {}".format(path))
 79 |     hash = identify(path)
 80 |     return [("local", hash, path), ("filename", hash, os.path.basename(path))]
 81 | 
 82 | 
 83 | ## ISOHash
 84 | def compute_isohash(hash, path, limits):
 85 |     eprint("Computing ISOHash for {}".format(path))
 86 |     context = get_context_by_suffix(path)
 87 |     if context == "wcnf":
 88 |         ihash = wcnfisohash(path)
 89 |     else:
 90 |         ihash = isohash(path)
 91 |     return [("isohash", hash, ihash)]
 92 | 
 93 | 
 94 | ## Base Features
 95 | def compute_base_features(hash, path, limits, tp=None):
 96 |     eprint("Extracting base features from {} {}".format(hash, path))
 97 |     rec = extract_base_features(path, limits["tlim"], limits["mlim"])
 98 |     return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()]
 99 | 
100 | 
101 | ## Gate Features
102 | def compute_gate_features(hash, path, limits, tp=None):
103 |     eprint("Extracting gate features from {} {}".format(hash, path))
104 |     rec = extract_gate_features(path, limits["tlim"], limits["mlim"])
105 |     return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()]
106 | 
107 | 
108 | ## WCNF Base Features
109 | def compute_wcnf_base_features(hash, path, limits, tp=None):
110 |     eprint("Extracting WCNF base features from {} {}".format(hash, path))
111 |     rec = extract_wcnf_base_features(path, limits["tlim"], limits["mlim"])
112 |     return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()]
113 | 
114 | 
115 | ## OPB Base Features
116 | def compute_opb_base_features(hash, path, limits, tp=None):
117 |     eprint("Extracting OPB base features from {} {}".format(hash, path))
118 |     rec = extract_opb_base_features(path, limits["tlim"], limits["mlim"])
119 |     return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()]
120 | 
121 | 
122 | ## SANI Features
123 | def compute_sani_features(hash, path, limits, tp=None):
124 |     eprint("Extracting SANI features from {} {}".format(hash, path))
125 |     rec = checksani(path, limits["tlim"], limits["mlim"])
126 |     return [(key, hash, int(value) if isinstance(value, float) and value.is_integer() else value) for key, value in rec.items()]
127 | 
128 | 
129 | generic_extractors = {
130 |     "base": {
131 |         "description": "Extract base features from CNF files. ",
132 |         "contexts": ["cnf", "sancnf"],
133 |         "features": [(name, "empty") for name in base_feature_names()],
134 |         "compute": compute_base_features,
135 |     },
136 |     "checksani": {
137 |         "description": "Extract sanitise status from CNF files. ",
138 |         "contexts": ["cnf", "sancnf"],
139 |         "features": [(name, "empty") for name in checksani_feature_names()],
140 |         "compute": compute_sani_features,
141 |     },
142 |     "gate": {
143 |         "description": "Extract gate features from CNF files. ",
144 |         "contexts": ["cnf", "sancnf"],
145 |         "features": [(name, "empty") for name in gate_feature_names()],
146 |         "compute": compute_gate_features,
147 |     },
148 |     "isohash": {
149 |         "description": "Compute ISOHash for CNF or WCNF files. ",
150 |         "contexts": ["cnf", "wcnf", "sancnf"],
151 |         "features": [("isohash", "empty")],
152 |         "compute": compute_isohash,
153 |     },
154 |     "wcnfbase": {
155 |         "description": "Extract base features from WCNF files. ",
156 |         "contexts": ["wcnf"],
157 |         "features": [(name, "empty") for name in wcnf_base_feature_names()],
158 |         "compute": compute_wcnf_base_features,
159 |     },
160 |     "opbbase": {
161 |         "description": "Extract base features from OPB files. ",
162 |         "contexts": ["opb"],
163 |         "features": [(name, "empty") for name in opb_base_feature_names()],
164 |         "compute": compute_opb_base_features,
165 |     },
166 | }
167 | 
168 | 
169 | def init_features_generic(key: str, api: GBD, rlimits, df: pl.DataFrame, target_db):
170 |     einfo = generic_extractors[key]
171 |     context = api.database.dcontext(target_db)
172 |     if not context in einfo["contexts"]:
173 |         raise InitializerException("Target database context must be in {}".format(einfo["contexts"]))
174 |     extractor = Initializer(api, rlimits, target_db, einfo["features"], einfo["compute"])
175 |     extractor.create_features()
176 |     extractor.run(df)
177 | 
178 | 
179 | def init_local(api: GBD, rlimits, root, target_db):
180 |     context = api.database.dcontext(target_db)
181 | 
182 |     features = [("local", None), ("filename", None)]
183 |     extractor = Initializer(api, rlimits, target_db, features, compute_hash)
184 |     extractor.create_features()
185 | 
186 |     # Cleanup stale entries
187 |     df: pl.DataFrame = api.query(group_by=context + ":local", collapse=None)
188 |      
189 |     def path_exists(p):
190 |         return p is not None and os.path.exists(p)
191 |     
192 |     missing = df.with_columns(
193 |         exists=pl.col("local").map_elements(
194 |             path_exists,
195 |             return_dtype=pl.Boolean
196 |         )
197 |     ).filter(~pl.col("exists")).select("local")
198 |     
199 |     if len(missing) and api.verbose:
200 |         for path in missing["local"].to_list():
201 |             eprint(path)
202 |     if len(missing) and confirm("{} files not found. Remove stale entries from local table?".format(len(missing))):
203 |         api.reset_values("local", values=missing["local"].to_list())
204 | 
205 |     # Create df with paths not yet in local table
206 |     paths = [path for suffix in suffixes(context) for path in glob.iglob(root + "/**/*" + suffix, recursive=True)]
207 |     df2 = pl.DataFrame([(None, path) for path in paths if path not in df["local"].to_list()], schema=["hash", "local"], orient="row")
208 | 
209 |     extractor.run(df2)
210 | 


--------------------------------------------------------------------------------
/gbd_server/server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | # MIT License
  4 | 
  5 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  6 | 
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | 
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | 
 17 | from logging.handlers import TimedRotatingFileHandler
 18 | import os
 19 | import re
 20 | import polars as pl
 21 | 
 22 | import flask
 23 | import logging
 24 | import waitress
 25 | from werkzeug.middleware.proxy_fix import ProxyFix
 26 | 
 27 | from gbd_core.database import DatabaseException
 28 | from gbd_core.api import GBD, GBDException
 29 | from gbd_core.grammar import ParserException
 30 | from gbd_core.util import is_number
 31 | from gbd_core import contexts
 32 | 
 33 | app = flask.Flask(__name__)
 34 | 
 35 | 
 36 | def request_query(request):
 37 |     query = ""
 38 |     if "query" in request.values:
 39 |         query = request.values.get("query")
 40 |     elif len(request.args) > 0:
 41 |         query = " and ".join(["{}={}".format(key, value) for (key, value) in request.args.items() if key != "context"])
 42 |     return query
 43 | 
 44 | 
 45 | def request_database(request):
 46 |     if "selected_db" in request.values and request.values.get("selected_db") in app.config["dbnames"]:
 47 |         dbname = request.values.get("selected_db")
 48 |         context = request_context(request)
 49 |         if dbname in [GBD.get_database_name(c) for c in app.config["contextdbs"][context]]:
 50 |             return dbname
 51 |         else:
 52 |             return GBD.get_database_name(app.config["contextdbs"][context][0])
 53 |     else:
 54 |         return app.config["dbnames"][0]
 55 | 
 56 | 
 57 | def request_page(request):
 58 |     return int(request.values.get("page")) if "page" in request.values else 0
 59 | 
 60 | 
 61 | def request_action(request):
 62 |     return request.values.get("action") if "action" in request.values else "default"
 63 | 
 64 | 
 65 | def request_context(request):
 66 |     return request.values.get("context") if "context" in request.values else contexts.default_context()
 67 | 
 68 | 
 69 | def query_to_name(query):
 70 |     return re.sub(r"[^\w]", "_", query) if query else "allinstances"
 71 | 
 72 | 
 73 | def error_response(msg, addr, errno=404):
 74 |     app.logger.error("{}: {}".format(addr, msg))
 75 |     return flask.Response(msg, status=errno, mimetype="text/plain")
 76 | 
 77 | 
 78 | def file_response(text_blob, filename, mimetype, addr):
 79 |     app.logger.info("{}: Sending generated file {}".format(addr, filename))
 80 |     return flask.Response(text_blob, mimetype=mimetype, headers={"Content-Disposition": 'attachment; filename="{}"'.format(filename), "filename": filename})
 81 | 
 82 | 
 83 | def path_response(path, filename, mimetype, addr):
 84 |     app.logger.info("{}: Sending file {}".format(addr, path))
 85 |     return flask.send_file(path, as_attachment=True, download_name=filename, mimetype=mimetype)
 86 | 
 87 | 
 88 | def json_response(json_blob, msg, addr):
 89 |     app.logger.info("{}: {}".format(addr, msg))
 90 |     return flask.Response(json_blob, status=200, mimetype="application/json")
 91 | 
 92 | 
 93 | def page_response(context, query, database, page=0):
 94 |     with GBD(app.config["contextdbs"][context]) as gbd:
 95 |         start = page * 1000
 96 |         end = start + 1000
 97 |         error = None
 98 |         try:
 99 |             df: pl.DataFrame = gbd.query(query, resolve=["{}:{}".format(database, f) for f in app.config["features"][database]], collapse="GROUP_CONCAT")
100 |         except GBDException as err:
101 |             error = "GBDException: {}".format(str(err))
102 |         except DatabaseException as err:
103 |             error = "DatabaseException: {}".format(str(err))
104 |         except ParserException as err:
105 |             error = "ParserException: {}".format(str(err))
106 |         except Exception as err:
107 |             error = "An Unhandled Exception Occurred"
108 |         return flask.render_template(
109 |             "index.html",
110 |             context=context,
111 |             error=error,
112 |             contexts=app.config["contexts"],
113 |             query=query,
114 |             query_name=query_to_name(query),
115 |             # result=df.iloc[start:end, :].values.tolist() if error is None else [],
116 |             result=(
117 |                 [list(r) for r in df.slice(start, end - start).rows()]
118 |                 if error is None
119 |                 else []
120 |             ),
121 |             total=len(df) if error is None else 0,
122 |             page=page,
123 |             pages=int(len(df) / 1000) + 1 if error is None else 0,
124 |             selected=database,
125 |             features=app.config["features"][database],
126 |             databases=[gbd.get_database_name(db) for db in app.config["contextdbs"][context]],
127 |             action=request_action(flask.request),
128 |         )
129 | 
130 | 
131 | # Returns main index page
132 | @app.route("/", methods=["POST", "GET"])
133 | def quick_search():
134 |     context = request_context(flask.request)
135 |     query = request_query(flask.request)
136 |     database = request_database(flask.request)
137 |     context_databases = [GBD.get_database_name(db) for db in app.config["contextdbs"][context]]
138 |     if not database in context_databases:
139 |         database = context_databases[0]
140 |     page = request_page(flask.request)
141 |     return page_response(context, query, database, page)
142 | 
143 | 
144 | # Generates a list of URLs. Given query (text field of POST form) is executed and the hashes of the result are resolved
145 | # against the filename feature. Every filename is associated with a URL to enable flexible downloading of these files
146 | @app.route("/getinstances/", methods=["POST", "GET"])
147 | @app.route("/getinstances", methods=["POST", "GET"])
148 | def get_url_file():
149 |     context = request_context(flask.request)
150 |     with GBD(app.config["contextdbs"][context]) as gbd:
151 |         query = request_query(flask.request)
152 |         try:
153 |             df: pl.DataFrame = gbd.query(query)
154 |         except (GBDException, DatabaseException, ParserException) as err:
155 |             return error_response("{}, {}".format(type(err), str(err)), flask.request.remote_addr, errno=500)
156 |         if context == "cnf":
157 |             content = "\n".join([flask.url_for("get_file", hashvalue=val, _external=True) for val in df["hash"].to_list()])
158 |         else:
159 |             content = "\n".join([flask.url_for("get_file", hashvalue=val, context=context, _external=True) for val in df["hash"].to_list()])
160 |         return file_response(content, query_to_name(query) + ".uri", "text/uri-list", flask.request.remote_addr)
161 | 
162 | 
163 | # Send database file
164 | @app.route("/getdatabase/")
165 | @app.route("/getdatabase")
166 | @app.route("/getdatabase/<database>/")
167 | @app.route("/getdatabase/<database>")
168 | def get_database_file(database=None):
169 |     dbname = database if database and database in app.config["dbnames"] else app.config["dbnames"][0]
170 |     dbpath = app.config["dbpaths"][dbname]
171 |     return path_response(dbpath, os.path.basename(dbpath), "application/x-sqlite3", flask.request.remote_addr)
172 | 
173 | 
174 | # Find the file corresponding to the hashvalue and send it to the client
175 | @app.route("/file/<hashvalue>/")
176 | @app.route("/file/<hashvalue>")
177 | def get_file(hashvalue):
178 |     context = request_context(flask.request)
179 |     print(context, app.config["contextdbs"][context])
180 |     with GBD(app.config["contextdbs"][context]) as gbd:
181 |         df: pl.DataFrame = gbd.query(hashes=[hashvalue], resolve=["local", "filename"], collapse="MIN")
182 |         if not len(df):
183 |             return error_response("Hash '{}' not found".format(hashvalue), flask.request.remote_addr)
184 |         row = df.to_dicts()[0]
185 |         if not os.path.exists(row["local"]):
186 |             return error_response("Files temporarily not accessible", flask.request.remote_addr)
187 |         return path_response(row["local"], row["hash"] + "-" + row["filename"], "application/x-xz", flask.request.remote_addr)
188 | 
189 | 
190 | # start the server
191 | def serve(gbd: GBD, port: int = 5000, logdir: str = "/tmp"):
192 |     formatter = logging.Formatter(
193 |         fmt="[%(asctime)s, %(name)s, %(levelname)s] %(module)s.%(filename)s.%(funcName)s():%(lineno)d\n%(message)s", datefmt="%Y-%m-%d %H:%M:%S"
194 |     )
195 |     logging.getLogger().setLevel(logging.DEBUG)
196 |     # Add sys.stdout to logging output
197 |     console_handler = logging.StreamHandler()
198 |     console_handler.setFormatter(formatter)
199 |     console_handler.setLevel(logging.INFO)
200 |     logging.getLogger().addHandler(console_handler)
201 |     # Add handler to write in rotating logging files
202 |     file_handler = TimedRotatingFileHandler(logdir + "/trfile.log", when="midnight", backupCount=10)
203 |     file_handler.setFormatter(formatter)
204 |     file_handler.setLevel(logging.WARNING)
205 |     logging.getLogger().addHandler(file_handler)
206 | 
207 |     global app
208 |     app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1)
209 | 
210 |     app.jinja_env.trim_blocks = True
211 |     app.jinja_env.lstrip_blocks = True
212 | 
213 |     app.jinja_env.tests["link_field"] = lambda field: field is not None and field.startswith("http")
214 |     app.jinja_env.tests["num_field"] = lambda field: field is not None and is_number(field)
215 |     app.jinja_env.tests["int_field"] = lambda field: field is not None and field.isnumeric()
216 | 
217 |     path = os.path.dirname(__file__)
218 |     app.static_folder = os.path.join(path, "static")
219 |     app.template_folder = os.path.join(path, "templates")
220 | 
221 |     app.config["contexts"] = gbd.get_contexts()
222 |     app.config["dbnames"] = gbd.get_databases()
223 |     # group databases by context
224 |     app.config["contextdbs"] = dict()
225 |     for ctxt in app.config["contexts"]:
226 |         app.config["contextdbs"][ctxt] = [gbd.get_database_path(c) for c in gbd.get_databases(ctxt)]
227 |     # group features by database
228 |     app.config["dbpaths"] = dict()
229 |     app.config["features"] = dict()
230 |     for db in app.config["dbnames"]:
231 |         app.config["features"][db] = [f for f in gbd.get_features(db) if not f in ["hash", "local"]]
232 |         app.config["dbpaths"][db] = gbd.get_database_path(db)
233 |     app.config["features_flat"] = [f for f in gbd.get_features() if not f in ["hash", "local"]]
234 | 
235 |     waitress.serve(app, host="0.0.0.0", port=port)
236 | 


--------------------------------------------------------------------------------
/gbd_core/api.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | 
 16 | import sqlite3
 17 | import tatsu
 18 | import polars as pl
 19 | 
 20 | from contextlib import ExitStack
 21 | import traceback
 22 | 
 23 | from gbd_core.query import GBDQuery
 24 | from gbd_core.database import Database
 25 | from gbd_core.database import Schema
 26 | from gbd_core import util
 27 | 
 28 | 
 29 | class GBDException(Exception):
 30 |     pass
 31 | 
 32 | 
 33 | class GBD:
 34 |     # Create a new GBD object which operates on the given databases
 35 |     def __init__(self, dbs: list, verbose: bool = False):
 36 |         assert isinstance(dbs, list)
 37 |         self.database = Database(dbs, verbose)
 38 |         self.verbose = verbose
 39 | 
 40 |     def __enter__(self):
 41 |         with ExitStack() as stack:
 42 |             stack.enter_context(self.database)
 43 |             self._stack = stack.pop_all()
 44 |         return self
 45 | 
 46 |     def __exit__(self, exc_type, exc, traceback):
 47 |         self._stack.__exit__(exc_type, exc, traceback)
 48 | 
 49 |     @classmethod
 50 |     def identify(cls, path):
 51 |         """Identify the given benchmark by its GBD hash
 52 | 
 53 |         Args:
 54 |         path (str): path to benchmark
 55 | 
 56 |         Returns:
 57 |         str: GBD hash
 58 |         """
 59 |         from gbd_core.contexts import identify
 60 | 
 61 |         return identify(path)
 62 | 
 63 |     def query(self, gbd_query=None, hashes=[], resolve=[], collapse="group_concat", group_by=None, join_type="LEFT") -> pl.DataFrame:
 64 |         """Query the database
 65 | 
 66 |         Args:
 67 |         gbd_query (str): GBD query string
 68 |         hashes (list): list of hashes (=benchmark ids), the query is restricted to
 69 |         resolve (list): list of features to be resolved
 70 |         collapse (str): collapse function: min, max, avg, count, sum, group_concat, or none
 71 |         group_by (str): group results by that feature instead of hash (default)
 72 |         join_type (str): join type: left or inner
 73 | 
 74 |         Returns:
 75 |         polars.DataFrame: query result
 76 |         """
 77 |         query_builder = GBDQuery(self.database, gbd_query)
 78 |         try:
 79 |             sql = query_builder.build_query(hashes, resolve, group_by, join_type, collapse)
 80 |         except tatsu.exceptions.FailedParse as err:
 81 |             if self.verbose:
 82 |                 util.eprint(traceback.format_exc())
 83 |             raise GBDException("Parser Error with Query '{}': {}".format(gbd_query, str(err)))
 84 |         try:
 85 |             result = self.database.query(sql)
 86 |         except sqlite3.OperationalError as err:
 87 |             if self.verbose:
 88 |                 util.eprint(traceback.format_exc())
 89 |             raise GBDException("Database Operational Error: {}".format(str(err)))
 90 |         group = group_by or query_builder.determine_group_by(resolve)
 91 |         cols = [p.split(":") for p in [group] + resolve]
 92 |         cols = [c[0] if len(c) == 1 else c[1] for c in cols]
 93 |         return pl.DataFrame(result, schema=cols, orient="row")
 94 | 
 95 |     def set_values(self, name, value, hashes, target_db=None):
 96 |         """Set feature value for given hashes
 97 | 
 98 |         Args:
 99 |         name (str): feature name
100 |         value (str): value to be set
101 |         hashes (list): list of hashes (=benchmark ids)
102 |         target_db (str, optional): name of target database
103 |         if None, default database (first in list) is used
104 |         Raises:
105 |         GBDException, if feature does not exist
106 |         """
107 |         if not self.feature_exists(name, target_db):
108 |             raise GBDException("Feature '{}' does not exist".format(name))
109 |         if not len(hashes):
110 |             raise GBDException("No hashes given")
111 |         self.database.set_values(name, value, hashes, target_db)
112 | 
113 |     def reset_values(self, feature, values=[], hashes=[], target_db=None):
114 |         """Reset feature value for given hashes
115 | 
116 |         Args:
117 |         feature (str): feature name
118 |         values (list, optional): list of values to be reset
119 |         hashes (list, optional): list of hashes (=benchmark ids) to be reset
120 |         target_db (str, optional): name of target database
121 |         if None, default database (first in list) is used
122 | 
123 |         Raises:
124 |         GBDException, if feature does not exist
125 |         """
126 |         if not self.feature_exists(feature, target_db):
127 |             raise GBDException("Feature '{}' does not exist".format(feature))
128 |         if len(values) and len(hashes):
129 |             for values_slice in util.slice_iterator(values, 10):
130 |                 for hashes_slice in util.slice_iterator(hashes, 10):
131 |                     self.database.delete(feature, values_slice, hashes_slice, target_db)
132 |                     self.database.commit()
133 |         elif len(values):
134 |             for values_slice in util.slice_iterator(values, 10):
135 |                 self.database.delete(feature, values_slice, [], target_db)
136 |                 self.database.commit()
137 |         elif len(hashes):
138 |             for hashes_slice in util.slice_iterator(hashes, 10):
139 |                 self.database.delete(feature, [], hashes_slice, target_db)
140 |                 self.database.commit()
141 | 
142 |     def delete_hashes(self, hashes, target_db=None):
143 |         """Delete all values for given hashes
144 | 
145 |         Args:
146 |         hashes (list): list of hashes (=benchmark ids) to be deleted
147 |         target_db (str, optional): name of target database
148 |         if None, default database (first in list) is used
149 | 
150 |         Raises:
151 |         GBDException, if feature does not exist
152 |         """
153 |         if not len(hashes):
154 |             raise GBDException("No hashes given")
155 |         self.database.delete_hashes_entirely(hashes, target_db)
156 | 
157 |     def get_databases(self, context=None):
158 |         """Get list of database names
159 | 
160 |         Returns: list of database names
161 |         """
162 |         if context is None:
163 |             return list(self.database.get_databases())
164 |         else:
165 |             return [db for db in self.database.get_databases() if self.database.dcontext(db) == context]
166 | 
167 |     def get_database_path(self, dbname):
168 |         """Get path for given database name
169 | 
170 |         Args:
171 |         dbname (str): name of database
172 | 
173 |         Returns: path to database
174 |         """
175 |         return self.database.dpath(dbname)
176 | 
177 |     @classmethod
178 |     def get_database_name(self, path):
179 |         """Get database name for given path
180 | 
181 |         Args:
182 |         path (str): path to database
183 | 
184 |         Returns: name of database
185 |         """
186 |         return Schema.dbname_from_path(path)
187 | 
188 |     def get_contexts(self, dbs=[]):
189 |         """Get list of contexts
190 | 
191 |         Returns: list of contexts
192 |         """
193 |         if not len(dbs):
194 |             return list(self.database.get_contexts())
195 |         else:
196 |             return list(set([self.database.dcontext(db) for db in dbs]))
197 | 
198 |     def get_feature_info(self, fname):
199 |         """Retrieve information about a specific feature"""
200 |         finfo = self.database.find(fname)
201 |         df: pl.DataFrame = self.query(resolve=[fname], collapse=None)
202 |         
203 |         min_value = sorted(pl.Series(df[fname]).to_list())[0]
204 |         max_value = sorted(pl.Series(df[fname]).to_list(), reverse=True)[0]
205 |         return {
206 |             "feature": fname,
207 |             "count": len(df),
208 |             "default": finfo.default,
209 |             "num-min": min_value,
210 |             "num-max": max_value,
211 |             "strings": " ".join(sorted([val for val in df[fname].unique() if val and not util.is_number(val)])),
212 |         }
213 | 
214 |     def get_features(self, dbname: str = None):
215 |         """Get features from the database.
216 | 
217 |         Args:
218 |         dbname (str): name of feature database
219 |         if None, feature list is accumulated over all databases
220 | 
221 |         Returns: list of features names
222 |         """
223 |         lst = self.database.get_features([] if not dbname else [dbname])
224 |         while "hash" in lst:
225 |             lst.remove("hash")
226 |         return lst
227 | 
228 |     def feature_exists(self, name, dbname=None):
229 |         """Check if feature exists in the database.
230 | 
231 |         Args:
232 |         name (str): name of feature
233 |         dbname (str): name of feature database
234 |         if None, feature existence is checked for in all databases
235 | 
236 |         Returns: True if feature exists in dbname or any database, False otherwise
237 |         """
238 |         return name in self.get_features(dbname)
239 | 
240 |     def create_feature(self, name: str, default_value: str = None, target_db: str = None):
241 |         """Creates feature with given name
242 | 
243 |         Args:
244 |         name (str): feature name
245 |         default_value (str): default value for 1:1 features
246 |         if None, a multi-valued (1:n) feature is created
247 |         target_db (str): database name
248 |         if None, default database (fist in list) is used
249 | 
250 |         Returns: None
251 | 
252 |         Raises:
253 |         GBDException, if feature already exists in target_db
254 |         """
255 |         if not self.feature_exists(name, target_db):
256 |             self.database.create_feature(name, default_value, target_db, False)
257 |         else:
258 |             raise GBDException("Feature '{}' does already exist".format(name))
259 | 
260 |     def delete_feature(self, name, target_db=None):
261 |         """Deletes feature with given name
262 | 
263 |         Args:
264 |         name (str): feature name
265 |         target_db (str): database name
266 |         if None, default database (fist in list) is used
267 | 
268 |         Returns: None
269 | 
270 |         Raises:
271 |         GBDException, if feature does not exist in target_db
272 |         """
273 |         if self.feature_exists(name, target_db):
274 |             self.database.delete_feature(name, target_db)
275 |         else:
276 |             raise GBDException("Feature '{}' does not exist".format(name))
277 | 
278 |     def rename_feature(self, old_name, new_name, target_db=None):
279 |         """Renames feature with given name
280 | 
281 |         Args:
282 |         old_name (str): old feature name
283 |         new_name (str): new feature name
284 |         target_db (str): database name
285 |         if None, default database (fist in list) is used
286 | 
287 |         Returns: None
288 | 
289 |         Raises:
290 |         GBDException,
291 |         - if feature 'old_name' does not exist in target_db
292 |         - if feature 'new_name' already exists in target_db
293 |         """
294 |         if not self.feature_exists(old_name, target_db):
295 |             raise GBDException("Feature '{}' does not exist".format(old_name))
296 |         elif self.feature_exists(new_name, target_db):
297 |             raise GBDException("Feature '{}' does already exist".format(new_name))
298 |         else:
299 |             self.database.rename_feature(old_name, new_name, target_db)
300 | 
301 |     def copy_feature(self, old_name, new_name, target_db=None, gbd_query=None, hashes=[]):
302 |         """Copies feature with given name
303 | 
304 |         Args:
305 |         old_name (str): old feature name
306 |         new_name (str): new feature name
307 |         target_db (str): name of database to copy feature to
308 |         if None, default database (fist in list) is used
309 | 
310 |         Returns: None
311 |         """
312 |         if not self.feature_exists(old_name):
313 |             raise GBDException("Feature '{}' does not exist".format(old_name))
314 | 
315 |         if not self.feature_exists(new_name, target_db):
316 |             self.create_feature(new_name, target_db=target_db)
317 | 
318 |         hashes = self.query(gbd_query=gbd_query, hashes=hashes)["hash"].to_list()
319 | 
320 |         self.database.copy_feature(old_name, new_name, target_db, hashes)
321 | 


--------------------------------------------------------------------------------
/gbd_core/database.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | import sqlite3
 16 | import typing
 17 | 
 18 | from pprint import pprint
 19 | 
 20 | from gbd_core.util import eprint
 21 | from gbd_core.schema import Schema, FeatureInfo
 22 | from gbd_core import contexts
 23 | 
 24 | 
 25 | class DatabaseException(Exception):
 26 |     pass
 27 | 
 28 | 
 29 | class Database:
 30 |     def __init__(self, path_list: list, verbose=False, autocommit=True):
 31 |         self.verbose = verbose
 32 |         self.schemas = self.init_schemas(path_list)
 33 |         self.features = self.init_features()
 34 |         self.connection = sqlite3.connect("file::memory:?cache=shared", uri=True, timeout=10)
 35 |         self.cursor = self.connection.cursor()
 36 |         self.maindb = None
 37 |         self.autocommit = autocommit
 38 |         schema: Schema
 39 |         for schema in self.schemas.values():
 40 |             if not schema.is_in_memory():
 41 |                 self.execute("ATTACH DATABASE '{}' AS {}".format(schema.path, schema.dbname))
 42 |             else:
 43 |                 self.execute("ATTACH DATABASE 'file:{}?mode=memory&cache=shared' AS {}".format(schema.dbname, schema.dbname))
 44 |             # first database is the default database:
 45 |             if not self.maindb:
 46 |                 self.maindb = schema.dbname
 47 | 
 48 |     def __enter__(self):
 49 |         return self
 50 | 
 51 |     def __exit__(self, exception_type, exception_value, traceback):
 52 |         self.connection.commit()
 53 |         self.connection.close()
 54 | 
 55 |     # returns major version of sqlite3 as float
 56 |     @classmethod
 57 |     def sqlite3_version(cls):
 58 |         return float(sqlite3.sqlite_version.rsplit(".", 1)[0])
 59 | 
 60 |     def init_schemas(self, path_list) -> typing.Dict[str, Schema]:
 61 |         result = dict()
 62 |         for path in path_list:
 63 |             schema = Schema.create(path)
 64 |             if not schema.dbname in result:
 65 |                 result[schema.dbname] = schema
 66 |             elif schema.is_in_memory():
 67 |                 result[schema.dbname].absorb(schema)
 68 |             else:
 69 |                 raise DatabaseException("Database name collision on " + schema.dbname)
 70 |         return result
 71 | 
 72 |     # return a dictionary which maps feature names to feature infos
 73 |     def init_features(self) -> typing.Dict[str, FeatureInfo]:
 74 |         result = dict()
 75 |         schema: Schema
 76 |         for schema in self.schemas.values():
 77 |             feature: FeatureInfo
 78 |             for feature in schema.features.values():
 79 |                 # first found feature is used: (=feature precedence by database position)
 80 |                 if not feature.name in result:
 81 |                     result[feature.name] = [feature]
 82 |                 elif feature.column == "hash" and feature.table == "features":
 83 |                     # first found features table is the one that serves the hash
 84 |                     if result[feature.name][0].table != "features":
 85 |                         result[feature.name].insert(0, feature)
 86 |                     else:
 87 |                         result[feature.name].append(feature)
 88 |                 else:
 89 |                     result[feature.name].append(feature)
 90 |         return result
 91 | 
 92 |     def query(self, q):
 93 |         if self.verbose:
 94 |             eprint(q)
 95 |         return self.cursor.execute(q).fetchall()
 96 | 
 97 |     def execute(self, q):
 98 |         if self.verbose:
 99 |             eprint(q)
100 |         self.cursor.execute(q)
101 |         if self.autocommit:
102 |             self.commit()
103 | 
104 |     def commit(self):
105 |         self.connection.commit()
106 | 
107 |     def set_auto_commit(self, autocommit):
108 |         self.autocommit = autocommit
109 | 
110 |     def dexists(self, dbname):
111 |         return dbname in self.schemas.keys()
112 | 
113 |     def dmain(self, dbname):
114 |         return dbname == self.maindb
115 | 
116 |     def dpath(self, dbname):
117 |         if not dbname in self.schemas:
118 |             raise DatabaseException("Database '{}' not found".format(dbname))
119 |         return self.schemas[dbname].path
120 | 
121 |     def dcontext(self, dbname):
122 |         if not dbname in self.schemas:
123 |             raise DatabaseException("Database '{}' not found".format(dbname))
124 |         return self.schemas[dbname].context
125 | 
126 |     def dtables(self, dbname):
127 |         if not dbname in self.schemas:
128 |             raise DatabaseException("Database '{}' not found".format(dbname))
129 |         return self.schemas[dbname].get_tables()
130 | 
131 |     def finfo(self, fname, db=None):
132 |         if fname in self.features and len(self.features[fname]) > 0:
133 |             if db is None:
134 |                 return self.features[fname][0]
135 |             else:
136 |                 infos = [info for info in self.features[fname] if info.database == db]
137 |                 if len(infos) == 0:
138 |                     raise DatabaseException("Feature '{}' does not exists in database {}".format(fname, db))
139 |                 return infos[0]
140 |         else:
141 |             raise DatabaseException("Feature '{}' does not exists".format(fname))
142 | 
143 |     def faddr_column(self, feature):
144 |         finfo = self.find(feature)
145 |         return "{}.{}.{}".format(finfo.database, finfo.table, finfo.column)
146 | 
147 |     def faddr_table(self, feature):
148 |         finfo = self.find(feature)
149 |         return "{}.{}".format(finfo.database, finfo.table)
150 | 
151 |     def find(self, fid: str, db: str = None):
152 |         """Find feature by name or feature identifier
153 | 
154 |         Args:
155 |         fid: feature identifier, of the form "database:feature", "context:feature" or "feature"
156 |         db: database name (optional), if given fid is unique without database: or context: prefix
157 | 
158 |         Returns:
159 |         FeatureInfo object: the info object for the first found feature
160 |         feature precedence is according to the order of databases in the path list
161 |         ambiguity can be resolved by using one of the following methods.
162 |         - by giving a database name as the second argument or
163 |         - by using the fid syntax "database:feature"
164 |         - by using the fid syntax "context:feature" (note that this does not necessarily resolve all ambiguity)
165 | 
166 |         Raises:
167 |         DatabaseException: if feature is not found or given database info is ambiguous
168 |         """
169 |         parts = fid.split(":")
170 |         if db is not None:
171 |             if len(parts) > 1:
172 |                 if parts[0] != db:
173 |                     raise DatabaseException("Ambiguous database identifiers: '{}' and '{}'".format(parts[0], db))
174 |                 else:
175 |                     return self.finfo(parts[1], parts[0])
176 |             return self.finfo(fid, db)
177 |         elif len(parts) == 1:
178 |             return self.finfo(fid)
179 |         elif parts[0] in self.get_databases():
180 |             return self.finfo(parts[1], parts[0])
181 |         elif parts[0] in self.get_contexts():
182 |             db = self.get_databases(parts[0])[0]
183 |             return self.finfo(parts[1], db)
184 |         else:
185 |             raise DatabaseException("Feature '{}' not found".format(fid))
186 | 
187 |     def faddr(self, fid: str, with_column=True):
188 |         finfo = self.find(fid)
189 | 
190 |         if with_column:
191 |             return "{}.{}.{}".format(finfo.database, finfo.table, finfo.column)
192 |         else:
193 |             return "{}.{}".format(finfo.database, finfo.table)
194 | 
195 |     def get_databases(self, context: str = None):
196 |         return [dbname for (dbname, schema) in self.schemas.items() if not context or context == schema.context]
197 | 
198 |     def get_contexts(self, dbs=[]):
199 |         return list(set([s.context for s in self.schemas.values() if not dbs or s.dbname in dbs]))
200 | 
201 |     def get_features(self, dbs=[]):
202 |         return [name for (name, infos) in self.features.items() for info in infos if not dbs or info.database in dbs]
203 | 
204 |     def get_tables(self, dbs=[]):
205 |         tables = [info.table for infos in self.features.values() for info in infos if not dbs or info.database in dbs]
206 |         return list(set(tables))
207 | 
208 |     def create_feature(self, name, default_value=None, target_db=None, permissive=False):
209 |         db = target_db or self.maindb
210 |         created = self.schemas[db].create_feature(name, default_value, permissive)
211 |         for finfo in created:
212 |             if not finfo.name in self.features.keys():
213 |                 self.features[finfo.name] = [finfo]
214 |             else:
215 |                 # this code disregards feature precedence by database position:
216 |                 self.features[finfo.name].append(finfo)
217 | 
218 |     def set_values(self, fname, value, hashes, target_db=None):
219 |         finfo = self.finfo(fname, target_db)
220 |         self.schemas[finfo.database].set_values(fname, value, hashes)
221 | 
222 |     def rename_feature(self, fname, new_fname, target_db=None):
223 |         Schema.valid_feature_or_raise(new_fname)
224 |         finfo = self.finfo(fname, target_db)
225 |         self.execute("ALTER TABLE {}.features RENAME COLUMN {} TO {}".format(finfo.database, fname, new_fname))
226 |         if finfo.default is None:
227 |             con = sqlite3.connect(self.schemas[finfo.database].path)
228 |             with con as cursor:
229 |                 cursor.execute("ALTER TABLE {} RENAME TO {}".format(fname, new_fname))
230 |             con.close()
231 |         self.features[fname].remove(finfo)
232 |         if not len(self.features[fname]):
233 |             del self.features[fname]
234 |         finfo.name = new_fname
235 |         if not new_fname in self.features.keys():
236 |             self.features[new_fname] = [finfo]
237 |         else:
238 |             # this code disregards feature precedence by database position:
239 |             self.features[new_fname].append(finfo)
240 | 
241 |     def delete_feature(self, fname, target_db=None):
242 |         finfo = self.finfo(fname, target_db)
243 |         if finfo.default is None:
244 |             self.execute("DROP TABLE IF EXISTS {}.{}".format(finfo.database, fname))
245 |         elif Database.sqlite3_version() >= 3.35:
246 |             self.execute("ALTER TABLE {}.{} DROP COLUMN {}".format(finfo.database, finfo.table, fname))
247 |         else:
248 |             raise DatabaseException("Cannot delete unique feature {} with SQLite versions < 3.35".format(fname))
249 |         self.features[fname].remove(finfo)
250 |         if not len(self.features[fname]):
251 |             del self.features[fname]
252 | 
253 |     def delete(self, fname, values=[], hashes=[], target_db=None):
254 |         finfo = self.finfo(fname, target_db)
255 |         w1 = "{cl} IN ('{v}')".format(cl=finfo.column, v="', '".join(values))
256 |         w2 = "hash IN ('{h}')".format(h="', '".join(hashes))
257 |         where = "{} AND {}".format(w1 if len(values) else "1=1", w2 if len(hashes) else "1=1")
258 |         db = finfo.database
259 |         if finfo.default is None:
260 |             hashlist = [r[0] for r in self.query("SELECT DISTINCT(hash) FROM {d}.{tab} WHERE {w}".format(d=db, tab=fname, w=where))]
261 |             self.execute("DELETE FROM {d}.{tab} WHERE {w}".format(d=db, tab=fname, w=where))
262 |             remaining = [
263 |                 r[0] for r in self.query("SELECT DISTINCT(hash) FROM {d}.{tab} WHERE hash in ('{h}')".format(d=db, tab=fname, h="', '".join(hashlist)))
264 |             ]
265 |             setnone = [h for h in hashlist if not h in remaining]
266 |             self.execute("UPDATE {d}.features SET {col} = 'None' WHERE hash IN ('{h}')".format(d=db, col=fname, h="', '".join(setnone)))
267 |         else:
268 |             self.execute("UPDATE {d}.features SET {col} = '{default}' WHERE {w}".format(d=db, col=fname, default=finfo.default, w=where))
269 | 
270 |     def delete_hashes_entirely(self, hashes, target_db=None):
271 |         tables = self.get_tables([target_db])
272 |         for table in tables:
273 |             self.execute("DELETE FROM {}.{} WHERE hash IN ('{h}')".format(target_db, table, h="', '".join(hashes)))
274 | 
275 |     def copy_feature(self, old_name, new_name, target_db, hashlist=[]):
276 |         old_finfo = self.find(old_name)
277 |         data = self.query(
278 |             "SELECT hash, {col} FROM {d}.{tab} WHERE hash IN ('{h}')".format(
279 |                 d=old_finfo.database, col=old_finfo.column, tab=old_finfo.table, h="', '".join(hashlist)
280 |             )
281 |         )
282 |         for hash, value in data:
283 |             self.set_values(new_name, value, [hash], target_db)
284 | 


--------------------------------------------------------------------------------
/gbd_server/static/w3.js:
--------------------------------------------------------------------------------
  1 | /* W3.JS 1.04 April 2019 by w3schools.com */
  2 | "use strict";
  3 | var w3 = {};
  4 | w3.hide = function (sel) {
  5 |   w3.hideElements(w3.getElements(sel));
  6 | };
  7 | w3.hideElements = function (elements) {
  8 |   var i, l = elements.length;
  9 |   for (i = 0; i < l; i++) {
 10 |     w3.hideElement(elements[i]);
 11 |   }
 12 | };
 13 | w3.hideElement = function (element) {
 14 |   w3.styleElement(element, "display", "none");
 15 | };
 16 | w3.show = function (sel, a) {
 17 |   var elements = w3.getElements(sel);
 18 |   if (a) {w3.hideElements(elements);}
 19 |   w3.showElements(elements);
 20 | };
 21 | w3.showElements = function (elements) {
 22 |   var i, l = elements.length;
 23 |   for (i = 0; i < l; i++) {
 24 |     w3.showElement(elements[i]);
 25 |   }
 26 | };
 27 | w3.showElement = function (element) {
 28 |   w3.styleElement(element, "display", "block");
 29 | };
 30 | w3.addStyle = function (sel, prop, val) {
 31 |   w3.styleElements(w3.getElements(sel), prop, val);
 32 | };
 33 | w3.styleElements = function (elements, prop, val) {
 34 |   var i, l = elements.length;
 35 |   for (i = 0; i < l; i++) {    
 36 |     w3.styleElement(elements[i], prop, val);
 37 |   }
 38 | };
 39 | w3.styleElement = function (element, prop, val) {
 40 |   element.style.setProperty(prop, val);
 41 | };
 42 | w3.toggleShow = function (sel) {
 43 |   var i, x = w3.getElements(sel), l = x.length;
 44 |   for (i = 0; i < l; i++) {    
 45 |     if (x[i].style.display == "none") {
 46 |       w3.styleElement(x[i], "display", "block");
 47 |     } else {
 48 |       w3.styleElement(x[i], "display", "none");
 49 |     }
 50 |   }
 51 | };
 52 | w3.addClass = function (sel, name) {
 53 |   w3.addClassElements(w3.getElements(sel), name);
 54 | };
 55 | w3.addClassElements = function (elements, name) {
 56 |   var i, l = elements.length;
 57 |   for (i = 0; i < l; i++) {
 58 |     w3.addClassElement(elements[i], name);
 59 |   }
 60 | };
 61 | w3.addClassElement = function (element, name) {
 62 |   var i, arr1, arr2;
 63 |   arr1 = element.className.split(" ");
 64 |   arr2 = name.split(" ");
 65 |   for (i = 0; i < arr2.length; i++) {
 66 |     if (arr1.indexOf(arr2[i]) == -1) {element.className += " " + arr2[i];}
 67 |   }
 68 | };
 69 | w3.removeClass = function (sel, name) {
 70 |   w3.removeClassElements(w3.getElements(sel), name);
 71 | };
 72 | w3.removeClassElements = function (elements, name) {
 73 |   var i, l = elements.length, arr1, arr2, j;
 74 |   for (i = 0; i < l; i++) {
 75 |     w3.removeClassElement(elements[i], name);
 76 |   }
 77 | };
 78 | w3.removeClassElement = function (element, name) {
 79 |   var i, arr1, arr2;
 80 |   arr1 = element.className.split(" ");
 81 |   arr2 = name.split(" ");
 82 |   for (i = 0; i < arr2.length; i++) {
 83 |     while (arr1.indexOf(arr2[i]) > -1) {
 84 |       arr1.splice(arr1.indexOf(arr2[i]), 1);     
 85 |     }
 86 |   }
 87 |   element.className = arr1.join(" ");
 88 | };
 89 | w3.toggleClass = function (sel, c1, c2) {
 90 |   w3.toggleClassElements(w3.getElements(sel), c1, c2);
 91 | };
 92 | w3.toggleClassElements = function (elements, c1, c2) {
 93 |   var i, l = elements.length;
 94 |   for (i = 0; i < l; i++) {    
 95 |     w3.toggleClassElement(elements[i], c1, c2);
 96 |   }
 97 | };
 98 | w3.toggleClassElement = function (element, c1, c2) {
 99 |   var t1, t2, t1Arr, t2Arr, j, arr, allPresent;
100 |   t1 = (c1 || "");
101 |   t2 = (c2 || "");
102 |   t1Arr = t1.split(" ");
103 |   t2Arr = t2.split(" ");
104 |   arr = element.className.split(" ");
105 |   if (t2Arr.length == 0) {
106 |     allPresent = true;
107 |     for (j = 0; j < t1Arr.length; j++) {
108 |       if (arr.indexOf(t1Arr[j]) == -1) {allPresent = false;}
109 |     }
110 |     if (allPresent) {
111 |       w3.removeClassElement(element, t1);
112 |     } else {
113 |       w3.addClassElement(element, t1);
114 |     }
115 |   } else {
116 |     allPresent = true;
117 |     for (j = 0; j < t1Arr.length; j++) {
118 |       if (arr.indexOf(t1Arr[j]) == -1) {allPresent = false;}
119 |     }
120 |     if (allPresent) {
121 |       w3.removeClassElement(element, t1);
122 |       w3.addClassElement(element, t2);          
123 |     } else {
124 |       w3.removeClassElement(element, t2);        
125 |       w3.addClassElement(element, t1);
126 |     }
127 |   }
128 | };
129 | w3.getElements = function (id) {
130 |   if (typeof id == "object") {
131 |     return [id];
132 |   } else {
133 |     return document.querySelectorAll(id);
134 |   }
135 | };
136 | w3.filterHTML = function(id, sel, filter) {
137 |   var a, b, c, i, ii, iii, hit;
138 |   a = w3.getElements(id);
139 |   for (i = 0; i < a.length; i++) {
140 |     b = a[i].querySelectorAll(sel);
141 |     for (ii = 0; ii < b.length; ii++) {
142 |       hit = 0;
143 |       if (b[ii].innerText.toUpperCase().indexOf(filter.toUpperCase()) > -1) {
144 |         hit = 1;
145 |       }
146 |       c = b[ii].getElementsByTagName("*");
147 |       for (iii = 0; iii < c.length; iii++) {
148 |         if (c[iii].innerText.toUpperCase().indexOf(filter.toUpperCase()) > -1) {
149 |           hit = 1;
150 |         }
151 |       }
152 |       if (hit == 1) {
153 |         b[ii].style.display = "";
154 |       } else {
155 |         b[ii].style.display = "none";
156 |       }
157 |     }
158 |   }
159 | };
160 | w3.sortHTML = function(id, sel, sortvalue) {
161 |   var a, b, i, ii, y, bytt, v1, v2, cc, j;
162 |   a = w3.getElements(id);
163 |   for (i = 0; i < a.length; i++) {
164 |     for (j = 0; j < 2; j++) {
165 |       cc = 0;
166 |       y = 1;
167 |       while (y == 1) {
168 |         y = 0;
169 |         b = a[i].querySelectorAll(sel);
170 |         for (ii = 0; ii < (b.length - 1); ii++) {
171 |           bytt = 0;
172 |           if (sortvalue) {
173 |             v1 = b[ii].querySelector(sortvalue).innerText;
174 |             v2 = b[ii + 1].querySelector(sortvalue).innerText;
175 |           } else {
176 |             v1 = b[ii].innerText;
177 |             v2 = b[ii + 1].innerText;
178 |           }
179 |           v1 = v1.toLowerCase();
180 |           v2 = v2.toLowerCase();
181 |           if ((j == 0 && (v1 > v2)) || (j == 1 && (v1 < v2))) {
182 |             bytt = 1;
183 |             break;
184 |           }
185 |         }
186 |         if (bytt == 1) {
187 |           b[ii].parentNode.insertBefore(b[ii + 1], b[ii]);
188 |           y = 1;
189 |           cc++;
190 |         }
191 |       }
192 |       if (cc > 0) {break;}
193 |     }
194 |   }
195 | };
196 | w3.slideshow = function (sel, ms, func) {
197 |   var i, ss, x = w3.getElements(sel), l = x.length;
198 |   ss = {};
199 |   ss.current = 1;
200 |   ss.x = x;
201 |   ss.ondisplaychange = func;
202 |   if (!isNaN(ms) || ms == 0) {
203 |     ss.milliseconds = ms;
204 |   } else {
205 |     ss.milliseconds = 1000;
206 |   }
207 |   ss.start = function() {
208 |     ss.display(ss.current)
209 |     if (ss.ondisplaychange) {ss.ondisplaychange();}
210 |     if (ss.milliseconds > 0) {
211 |       window.clearTimeout(ss.timeout);
212 |       ss.timeout = window.setTimeout(ss.next, ss.milliseconds);
213 |     }
214 |   };
215 |   ss.next = function() {
216 |     ss.current += 1;
217 |     if (ss.current > ss.x.length) {ss.current = 1;}
218 |     ss.start();
219 |   };
220 |   ss.previous = function() {
221 |     ss.current -= 1;
222 |     if (ss.current < 1) {ss.current = ss.x.length;}
223 |     ss.start();
224 |   };
225 |   ss.display = function (n) {
226 |     w3.styleElements(ss.x, "display", "none");
227 |     w3.styleElement(ss.x[n - 1], "display", "block");
228 |   }
229 |   ss.start();
230 |   return ss;
231 | };
232 | w3.includeHTML = function(cb) {
233 |   var z, i, elmnt, file, xhttp;
234 |   z = document.getElementsByTagName("*");
235 |   for (i = 0; i < z.length; i++) {
236 |     elmnt = z[i];
237 |     file = elmnt.getAttribute("w3-include-html");
238 |     if (file) {
239 |       xhttp = new XMLHttpRequest();
240 |       xhttp.onreadystatechange = function() {
241 |         if (this.readyState == 4) {
242 |           if (this.status == 200) {elmnt.innerHTML = this.responseText;}
243 |           if (this.status == 404) {elmnt.innerHTML = "Page not found.";}
244 |           elmnt.removeAttribute("w3-include-html");
245 |           w3.includeHTML(cb);
246 |         }
247 |       }      
248 |       xhttp.open("GET", file, true);
249 |       xhttp.send();
250 |       return;
251 |     }
252 |   }
253 |   if (cb) cb();
254 | };
255 | w3.getHttpData = function (file, func) {
256 |   w3.http(file, function () {
257 |     if (this.readyState == 4 && this.status == 200) {
258 |       func(this.responseText);
259 |     }
260 |   });
261 | };
262 | w3.getHttpObject = function (file, func) {
263 |   w3.http(file, function () {
264 |     if (this.readyState == 4 && this.status == 200) {
265 |       func(JSON.parse(this.responseText));
266 |     }
267 |   });
268 | };
269 | w3.displayHttp = function (id, file) {
270 |   w3.http(file, function () {
271 |     if (this.readyState == 4 && this.status == 200) {
272 |       w3.displayObject(id, JSON.parse(this.responseText));
273 |     }
274 |   });
275 | };
276 | w3.http = function (target, readyfunc, xml, method) {
277 |   var httpObj;
278 |   if (!method) {method = "GET"; }
279 |   if (window.XMLHttpRequest) {
280 |     httpObj = new XMLHttpRequest();
281 |   } else if (window.ActiveXObject) {
282 |     httpObj = new ActiveXObject("Microsoft.XMLHTTP");
283 |   }
284 |   if (httpObj) {
285 |     if (readyfunc) {httpObj.onreadystatechange = readyfunc;}
286 |     httpObj.open(method, target, true);
287 |     httpObj.send(xml);
288 |   }
289 | };
290 | w3.getElementsByAttribute = function (x, att) {
291 |   var arr = [], arrCount = -1, i, l, y = x.getElementsByTagName("*"), z = att.toUpperCase();
292 |   l = y.length;
293 |   for (i = -1; i < l; i += 1) {
294 |     if (i == -1) {y[i] = x;}
295 |     if (y[i].getAttribute(z) !== null) {arrCount += 1; arr[arrCount] = y[i];}
296 |   }
297 |   return arr;
298 | };  
299 | w3.dataObject = {},
300 | w3.displayObject = function (id, data) {
301 |   var htmlObj, htmlTemplate, html, arr = [], a, l, rowClone, x, j, i, ii, cc, repeat, repeatObj, repeatX = "";
302 |   htmlObj = document.getElementById(id);
303 |   htmlTemplate = init_template(id, htmlObj);
304 |   html = htmlTemplate.cloneNode(true);
305 |   arr = w3.getElementsByAttribute(html, "w3-repeat");
306 |   l = arr.length;
307 |   for (j = (l - 1); j >= 0; j -= 1) {
308 |     cc = arr[j].getAttribute("w3-repeat").split(" ");
309 |     if (cc.length == 1) {
310 |       repeat = cc[0];
311 |     } else {
312 |       repeatX = cc[0];
313 |       repeat = cc[2];
314 |     }
315 |     arr[j].removeAttribute("w3-repeat");
316 |     repeatObj = data[repeat];
317 |     if (repeatObj && typeof repeatObj == "object" && repeatObj.length != "undefined") {
318 |       i = 0;
319 |       for (x in repeatObj) {
320 |         i += 1;
321 |         rowClone = arr[j];
322 |         rowClone = w3_replace_curly(rowClone, "element", repeatX, repeatObj[x]);
323 |         a = rowClone.attributes;
324 |         for (ii = 0; ii < a.length; ii += 1) {
325 |           a[ii].value = w3_replace_curly(a[ii], "attribute", repeatX, repeatObj[x]).value;
326 |         }
327 |         (i === repeatObj.length) ? arr[j].parentNode.replaceChild(rowClone, arr[j]) : arr[j].parentNode.insertBefore(rowClone, arr[j]);
328 |       }
329 |     } else {
330 |       console.log("w3-repeat must be an array. " + repeat + " is not an array.");
331 |       continue;
332 |     }
333 |   }
334 |   html = w3_replace_curly(html, "element");
335 |   htmlObj.parentNode.replaceChild(html, htmlObj);
336 |   function init_template(id, obj) {
337 |     var template;
338 |     template = obj.cloneNode(true);
339 |     if (w3.dataObject.hasOwnProperty(id)) {return w3.dataObject[id];}
340 |     w3.dataObject[id] = template;
341 |     return template;
342 |   }
343 |   function w3_replace_curly(elmnt, typ, repeatX, x) {
344 |     var value, rowClone, pos1, pos2, originalHTML, lookFor, lookForARR = [], i, cc, r;
345 |     rowClone = elmnt.cloneNode(true);
346 |     pos1 = 0;
347 |     while (pos1 > -1) {
348 |       originalHTML = (typ == "attribute") ? rowClone.value : rowClone.innerHTML;
349 |       pos1 = originalHTML.indexOf("{{", pos1);
350 |       if (pos1 === -1) {break;}
351 |       pos2 = originalHTML.indexOf("}}", pos1 + 1);
352 |       lookFor = originalHTML.substring(pos1 + 2, pos2);
353 |       lookForARR = lookFor.split("||");
354 |       value = undefined;
355 |       for (i = 0; i < lookForARR.length; i += 1) {
356 |         lookForARR[i] = lookForARR[i].replace(/^\s+|\s+$/gm, ''); //trim
357 |         if (x) {value = x[lookForARR[i]];}
358 |         if (value == undefined && data) {value = data[lookForARR[i]];}
359 |         if (value == undefined) {
360 |           cc = lookForARR[i].split(".");
361 |           if (cc[0] == repeatX) {value = x[cc[1]]; }
362 |         }
363 |         if (value == undefined) {
364 |           if (lookForARR[i] == repeatX) {value = x;}
365 |         }
366 |         if (value == undefined) {
367 |           if (lookForARR[i].substr(0, 1) == '"') {
368 |             value = lookForARR[i].replace(/"/g, "");
369 |           } else if (lookForARR[i].substr(0,1) == "'") {
370 |             value = lookForARR[i].replace(/'/g, "");
371 |           }
372 |         }
373 |         if (value != undefined) {break;}
374 |       }
375 |       if (value != undefined) {
376 |         r = "{{" + lookFor + "}}";
377 |         if (typ == "attribute") {
378 |           rowClone.value = rowClone.value.replace(r, value);
379 |         } else {
380 |           w3_replace_html(rowClone, r, value);
381 |         }
382 |       }
383 |       pos1 = pos1 + 1;
384 |     }
385 |     return rowClone;
386 |   }
387 |   function w3_replace_html(a, r, result) {
388 |     var b, l, i, a, x, j;
389 |     if (a.hasAttributes()) {
390 |       b = a.attributes;
391 |       l = b.length;
392 |       for (i = 0; i < l; i += 1) {
393 |         if (b[i].value.indexOf(r) > -1) {b[i].value = b[i].value.replace(r, result);}
394 |       }
395 |     }
396 |     x = a.getElementsByTagName("*");
397 |     l = x.length;
398 |     a.innerHTML = a.innerHTML.replace(r, result);
399 |   }
400 | };


--------------------------------------------------------------------------------
/gbd.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | # MIT License
  4 | 
  5 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  6 | 
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | 
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | 
 17 | import os
 18 | import sys
 19 | import traceback
 20 | import polars as pl
 21 | 
 22 | from gbd_core.api import GBD, GBDException
 23 | from gbd_core.grammar import ParserException
 24 | from gbd_core import util, contexts, schema
 25 | from gbd_core.util_argparse import *
 26 | from gbd_init.feature_extractors import generic_extractors
 27 | from gbd_init.instance_transformers import generic_transformers
 28 | 
 29 | 
 30 | ### Command-Line Interface Entry Points
 31 | def cli_hash(api: GBD, args):
 32 |     from gbd_core.contexts import identify
 33 | 
 34 |     print(identify(args.path))
 35 | 
 36 | 
 37 | def cli_init_local(api: GBD, args):
 38 |     from gbd_init.feature_extractors import init_local
 39 | 
 40 |     rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim}
 41 |     init_local(api, rlimits, args.path, args.target)
 42 | 
 43 | 
 44 | def cli_init_generic(api: GBD, args):
 45 |     from gbd_init.feature_extractors import init_features_generic
 46 | 
 47 |     rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim}
 48 |     context = api.database.dcontext(args.target)
 49 |     df = api.query(args.query, args.hashes, [context + ":local"], collapse="MIN", group_by=context + ":hash")
 50 |     init_features_generic(args.initfuncname, api, rlimits, df, args.target)
 51 | 
 52 | 
 53 | def cli_trans_generic(api: GBD, args):
 54 |     from gbd_init.instance_transformers import transform_instances_generic
 55 | 
 56 |     rlimits = {"jobs": args.jobs, "tlim": args.tlim, "mlim": args.mlim, "flim": args.flim}
 57 |     transform_instances_generic(args.transfuncname, api, rlimits, args.query, args.hashes, args.target, args.source, args.collapse)
 58 | 
 59 | 
 60 | def cli_create(api: GBD, args):
 61 |     api.create_feature(args.name, args.unique, args.target)
 62 | 
 63 | 
 64 | def cli_delete(api: GBD, args):
 65 |     if (args.hashes and len(args.hashes) or args.values and len(args.values)) and args.name:
 66 |         if args.force or util.confirm("Delete attributes of given hashes and/or values from '{}'?".format(args.name)):
 67 |             api.reset_values(args.name, args.values, args.hashes)
 68 |     elif args.hashes and not args.name:
 69 |         if args.force or util.confirm("Delete given hashes entirely?".format(args.name)):
 70 |             api.delete_hashes(args.hashes)
 71 |     elif args.force or util.confirm("Delete feature '{}' and all associated attributes?".format(args.name)):
 72 |         api.delete_feature(args.name)
 73 | 
 74 | 
 75 | def cli_cleanup(api: GBD, args):
 76 |     if args.hashes and len(args.hashes):
 77 |         if args.force or util.confirm("Delete attributes of given hashes from all features?"):
 78 |             api.delete_hashes(args.hashes, args.target)
 79 | 
 80 | 
 81 | def cli_rename(api: GBD, args):
 82 |     api.rename_feature(args.old_name, args.new_name)
 83 | 
 84 | 
 85 | def cli_copy(api: GBD, args):
 86 |     api.copy_feature(args.old_name, args.new_name, args.target, args.query, args.hashes)
 87 | 
 88 | 
 89 | def cli_get(api: GBD, args):
 90 |     df: pl.DataFrame = api.query(args.query, args.hashes, args.resolve, args.collapse, args.group_by, args.join_type)
 91 |     if args.header:
 92 |         print(args.delimiter.join(df.columns))
 93 |     for row in df.iter_rows(named=True):
 94 |         print(args.delimiter.join([str(row[col]) if row[col] is not None else "[None]" for col in df.columns]))
 95 | 
 96 | 
 97 | def cli_set(api: GBD, args):
 98 |     hashes = api.query(args.query, args.hashes)["hash"].to_list()
 99 |     if args.create:
100 |         hashes = list(set(hashes + args.hashes))
101 |     if len(hashes) > 0:
102 |         api.set_values(args.assign[0], args.assign[1], hashes)
103 | 
104 | 
105 | def cli_info(api: GBD, args):
106 |     if args.contexts:
107 |         print("# Available Contexts: " + ", ".join(contexts.contexts()))
108 |         for context in contexts.contexts():
109 |             print()
110 |             print("## " + contexts.description(context))
111 |             print(" - Context Prefix: " + context)
112 |             print(" - File Extensions: " + ",".join(contexts.suffixes(context)))
113 |     elif args.name is None:
114 |         print("# Available Data Sources: " + ", ".join(api.get_databases()))
115 |         for dbname in api.get_databases():
116 |             if len(api.get_features(dbname)):
117 |                 print()
118 |                 print("## " + api.get_database_path(dbname))
119 |                 print(" - Name: " + dbname)
120 |                 feat = api.get_features(dbname)
121 |                 print(" - Features: " + " ".join(feat))
122 |                 if args.verbose:
123 |                     for f in feat:
124 |                         info = api.database.find(":".join([dbname, f]))
125 |                         print(info)
126 |     else:
127 |         info = api.get_feature_info(args.name)
128 |         for key in info:
129 |             print("{}: {}".format(key, info[key]))
130 | 
131 | 
132 | def cli_server(api: GBD, args):
133 |     from gbd_server import server
134 | 
135 |     util.eprint("Starting GBD Server on port {}...".format(args.port))
136 |     util.eprint(r"""
137 | Warning: All files referenced in the configured databases are now accessible on the specified port.
138 | If you do not trust the source of the databases, do not run the server.
139 | """)
140 |     server.serve(api, args.port, args.logdir)
141 | 
142 | 
143 | ### Define Command-Line Interface and Map Sub-Commands to Methods
144 | def main():
145 |     parser = get_gbd_argparser()
146 | 
147 |     subparsers = parser.add_subparsers(help="Available Commands:", required=True, dest="gbd command")
148 | 
149 |     # INITIALIZATION
150 |     parser_init = subparsers.add_parser("init", help="Initialize Database")
151 |     add_resource_limits_arguments(parser_init)
152 |     parser_init.add_argument("--target", help="Target database for new features (default: first db in list); also determines target context", default=None)
153 | 
154 |     parser_init_subparsers = parser_init.add_subparsers(help="Select Initialization Procedure:", required=True, dest="init what?")
155 | 
156 |     # init local paths:
157 |     parser_init_local = parser_init_subparsers.add_parser("local", help="Initialize Local Hash/Path Entries")
158 |     parser_init_local.add_argument("path", type=directory_type, help="Path to benchmarks")
159 |     parser_init_local.set_defaults(func=cli_init_local)
160 | 
161 |     # hooks for generic feature extractors:
162 |     for key in generic_extractors.keys():
163 |         gex = generic_extractors[key]
164 |         parser_init_generic = parser_init_subparsers.add_parser(key, help=gex["description"])
165 |         add_query_and_hashes_arguments(parser_init_generic)
166 |         parser_init_generic.set_defaults(func=cli_init_generic, initfuncname=key)
167 | 
168 |     # TRANSFORMATION
169 |     parser_trans = subparsers.add_parser("transform", help="Transform Benchmarks")
170 |     add_resource_limits_arguments(parser_trans)
171 |     parser_trans.add_argument("--source", help="Source context", default=contexts.default_context())
172 |     parser_trans.add_argument("--target", help="Target database; determines target context (default: first db in list)", default=None)
173 | 
174 |     parser_trans_subparsers = parser_trans.add_subparsers(help="Select Transformation Procedure:", required=True, dest="transform how?")
175 | 
176 |     # hooks for generic instance transformers:
177 |     for key in generic_transformers.keys():
178 |         gex = generic_transformers[key]
179 |         parser_trans_generic = parser_trans_subparsers.add_parser(key, help=gex["description"])
180 |         add_query_and_hashes_arguments(parser_trans_generic)
181 |         parser_trans_generic.set_defaults(func=cli_trans_generic, transfuncname=key)
182 |         parser_trans_generic.add_argument(
183 |             "-c",
184 |             "--collapse",
185 |             default="group_concat",
186 |             choices=["group_concat", "min", "max", "avg", "count", "sum", "none"],
187 |             help="Specify a function for the handling of multiple feature values",
188 |         )
189 | 
190 |     # GBD HASH
191 |     parser_hash = subparsers.add_parser("hash", help="Print hash for a single file")
192 |     parser_hash.add_argument("path", type=file_type, help="Path to one benchmark")
193 |     parser_hash.set_defaults(func=cli_hash)
194 | 
195 |     # GBD GET $QUERY
196 |     parser_get = subparsers.add_parser("get", help="Get data by query (or hash-list via stdin)")
197 |     add_query_and_hashes_arguments(parser_get)
198 |     parser_get.add_argument("-r", "--resolve", help="List of feature names to resolve against", nargs="+", default=[])
199 |     parser_get.add_argument(
200 |         "-c",
201 |         "--collapse",
202 |         default="group_concat",
203 |         choices=["group_concat", "min", "max", "avg", "count", "sum", "none"],
204 |         help="Specify a function for the handling of multiple feature values",
205 |     )
206 |     parser_get.add_argument("-g", "--group_by", default=None, help="Group by the specified feature as the key, rather than by the primary key")
207 |     parser_get.add_argument("--join-type", help="Join Type: treatment of missing values", choices=["INNER", "OUTER", "LEFT"], default="LEFT")
208 |     parser_get.add_argument("-d", "--delimiter", default=" ", help="CSV delimiter to use in output")
209 |     parser_get.add_argument("-H", "--header", action="store_true", help="Include header information in output")
210 |     parser_get.set_defaults(func=cli_get)
211 | 
212 |     # GBD SET
213 |     parser_set = subparsers.add_parser("set", help="Set specified attribute-value for query result")
214 |     parser_set.add_argument("assign", type=key_value_type, help="key=value")
215 |     parser_set.add_argument(
216 |         "-c", "--create", help="Create given hashes if they do not exist yet (otherwise intersect with existing hashes)", action="store_true"
217 |     )
218 |     add_query_and_hashes_arguments(parser_set)
219 |     parser_set.set_defaults(func=cli_set)
220 | 
221 |     # CREATE/DELETE/MODIFY FEATURES
222 |     parser_create = subparsers.add_parser("create", help="Create a new feature")
223 |     parser_create.add_argument("name", type=column_type, help="Name of feature")
224 |     parser_create.add_argument("-u", "--unique", help="Unique constraint: specify default-value of feature")
225 |     parser_create.add_argument("--target", help="Target database (default: first in list)", default=None)
226 |     parser_create.set_defaults(func=cli_create)
227 | 
228 |     parser_delete = subparsers.add_parser(
229 |         "delete", help="Delete all values assiociated with given hashes (via argument or stdin) or remove feature if no hashes are given"
230 |     )
231 |     parser_delete.add_argument("--hashes", help="Hashes for which to delete values", nargs="*", default=[])
232 |     parser_delete.add_argument("--values", help="Values to delete", nargs="*", default=[])
233 |     parser_delete.add_argument("name", type=column_type, help="Name of feature (default: all)", nargs="?")
234 |     parser_delete.add_argument("-f", "--force", action="store_true", help="Do not ask for confirmation")
235 |     parser_delete.set_defaults(func=cli_delete)
236 | 
237 |     parser_cleanup = subparsers.add_parser("cleanup", help="Delete given hashes from all features")
238 |     parser_cleanup.add_argument("--hashes", help="Hashes for which to delete values", nargs="*", default=[])
239 |     parser_cleanup.add_argument("-f", "--force", action="store_true", help="Do not ask for confirmation")
240 |     parser_cleanup.add_argument("--target", help="Target database (default: first in list)", default=None)
241 |     parser_cleanup.set_defaults(func=cli_cleanup)
242 | 
243 |     parser_rename = subparsers.add_parser("rename", help="Rename feature")
244 |     parser_rename.add_argument("old_name", type=column_type, help="Old name of feature")
245 |     parser_rename.add_argument("new_name", type=column_type, help="New name of feature")
246 |     parser_rename.set_defaults(func=cli_rename)
247 | 
248 |     parser_copy = subparsers.add_parser("copy", help="Copy feature")
249 |     add_query_and_hashes_arguments(parser_copy)
250 |     parser_copy.add_argument("--target", help="Target database (default: first in list)", default=None)
251 |     parser_copy.add_argument("old_name", type=column_type, help="Old name of feature")
252 |     parser_copy.add_argument("new_name", type=column_type, help="New name of feature")
253 |     parser_copy.set_defaults(func=cli_copy)
254 | 
255 |     # GET META INFO
256 |     parser_info = subparsers.add_parser("info", help="Print info about available features")
257 |     parser_info.add_argument("-c", "--contexts", action="store_true", help="Print available contexts")
258 |     parser_info.add_argument("name", type=column_type, help="Print info about specified feature", nargs="?")
259 |     parser_info.set_defaults(func=cli_info)
260 | 
261 |     # RUN SERVER
262 |     parser_server = subparsers.add_parser("serve", help="Run GBD Server")
263 |     parser_server.add_argument("-p", "--port", help="Specify port on which to listen", default=os.environ.get("GBD_PORT") or 5000, type=int)
264 |     parser_server.add_argument("-l", "--logdir", help="Specify directory for logfiles", default=os.environ.get("GBD_LOGS") or "./")
265 |     parser_server.set_defaults(func=cli_server)
266 | 
267 |     # PARSE ARGUMENTS
268 |     args = parser.parse_args()
269 |     try:
270 |         if hasattr(args, "hashes") and not sys.stdin.isatty():
271 |             if not args.hashes or len(args.hashes) == 0:
272 |                 args.hashes = util.read_hashes()  # read hashes from stdin
273 |         if hasattr(args, "target") and args.target is None:
274 |             args.target = schema.Schema.dbname_from_path(args.db.split(os.pathsep)[0])
275 |         if args.db is None or len(args.db) == 0:
276 |             util.eprint("No database specified. Use -d or set GBD_DB environment variable.")
277 |             sys.exit(1)
278 |         with GBD(args.db.split(os.pathsep), args.verbose) as api:
279 |             args.func(api, args)
280 |     except ModuleNotFoundError as e:
281 |         util.eprint("Module '{}' not found. Please install it.".format(e.name))
282 |         if e.name == "gbdc":
283 |             util.eprint("Find installation instructions at https://github.com/Udopia/gbdc")
284 |         sys.exit(1)
285 |     except ParserException as e:
286 |         util.eprint("Failed to parse query: " + args.query)
287 |         if args.verbose:
288 |             util.eprint(traceback.format_exc())
289 |         sys.exit(1)
290 |     except pl.exceptions.DataOrientationWarning as e:
291 |         util.eprint(traceback.format_exc())
292 |     except Exception as e:
293 |         util.eprint("{}: {}".format(type(e), str(e)))
294 |         if args.verbose:
295 |             util.eprint(traceback.format_exc())
296 |         sys.exit(1)
297 | 
298 | 
299 | if __name__ == "__main__":
300 |     main()
301 | 


--------------------------------------------------------------------------------
/gbd_core/schema.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | 
  3 | # Copyright (c) 2025 Ashlin Iser, Karlsruhe Institute of Technology (KIT)
  4 | 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | 
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | 
 15 | import sqlite3
 16 | import typing
 17 | import os
 18 | import csv
 19 | import re
 20 | 
 21 | from dataclasses import dataclass
 22 | 
 23 | from gbd_core import contexts
 24 | from gbd_core.util import eprint, confirm
 25 | 
 26 | 
 27 | class SchemaException(Exception):
 28 |     pass
 29 | 
 30 | 
 31 | @dataclass
 32 | class FeatureInfo:
 33 |     name: str = None
 34 |     database: str = None
 35 |     table: str = None
 36 |     column: str = None
 37 |     default: str = None
 38 | 
 39 | 
 40 | class Schema:
 41 |     def __init__(self, dbcon, dbname, path, features, context, csv=False):
 42 |         self.dbname = dbname
 43 |         self.path = path
 44 |         self.features = features
 45 |         self.context = context
 46 |         self.dbcon = dbcon
 47 |         self.csv = csv
 48 | 
 49 |     @classmethod
 50 |     def is_database(cls, path):
 51 |         if os.path.isfile(path):
 52 |             sz = os.path.getsize(path)
 53 |             if sz == 0:
 54 |                 return True  # new sqlite3 files can be empty
 55 |             if sz < 100:
 56 |                 return False  # sqlite header is 100 bytes
 57 |             with open(path, "rb") as fd:
 58 |                 header = fd.read(100)  # validate header
 59 |             return header[:16] == b"SQLite format 3\x00"
 60 |         elif confirm("Database '{}' does not exist. Create new database?".format(path)):
 61 |             sqlite3.connect(path).close()
 62 |             return True
 63 |         else:
 64 |             raise SchemaException("Database '{}' does not exist".format(path))
 65 | 
 66 |     @classmethod
 67 |     def create(cls, path):
 68 |         try:
 69 |             if cls.is_database(path):
 70 |                 return cls.from_database(path)
 71 |             else:
 72 |                 return cls.from_csv(path)
 73 |         except Exception as e:
 74 |             raise SchemaException(str(e))
 75 | 
 76 |     @classmethod
 77 |     def from_database(cls, path):
 78 |         dbname = cls.dbname_from_path(path)
 79 |         con = sqlite3.connect(path)
 80 |         features = cls.features_from_database(dbname, path, con)
 81 |         context = cls.context_from_database(dbname)
 82 |         return cls(con, dbname, path, features, context)
 83 | 
 84 |     @classmethod
 85 |     def from_csv(cls, path):
 86 |         dbname = cls.dbname_from_path(path)
 87 |         con = sqlite3.connect("file:{}?mode=memory&cache=shared".format(dbname), uri=True)
 88 |         features = cls.features_from_csv(dbname, path, con)
 89 |         context = cls.context_from_csv(dbname)
 90 |         return cls(con, dbname, path, features, context, True)
 91 | 
 92 |     # Import CSV to in-memory db, create according schema info
 93 |     @classmethod
 94 |     def features_from_csv(cls, dbname, path, con) -> typing.Dict[str, FeatureInfo]:
 95 |         features = dict()
 96 |         with open(path) as csvfile:
 97 |             temp_lines = csvfile.readline() + "\n" + csvfile.readline()
 98 |             dialect = csv.Sniffer().sniff(temp_lines, delimiters=";, \t")
 99 |             csvfile.seek(0)
100 |             csvreader = csv.DictReader(csvfile, dialect=dialect)
101 |             if "hash" in csvreader.fieldnames:
102 |                 cols = [re.sub("[^0-9a-zA-Z]+", "_", n) for n in csvreader.fieldnames]
103 |                 for colname in cols:
104 |                     features[colname] = FeatureInfo(colname, dbname, "features", colname, None)
105 |                 con.execute("CREATE TABLE IF NOT EXISTS {} ({})".format("features", ", ".join(cols)))
106 |                 for row in csvreader:
107 |                     con.execute("INSERT INTO {} VALUES ('{}')".format("features", "', '".join(row.values())))
108 |                 con.commit()
109 |             else:
110 |                 raise SchemaException("Column 'hash' not found in {}".format(csvfile))
111 |         return features
112 | 
113 |     # Create schema info for sqlite database
114 |     @classmethod
115 |     def features_from_database(cls, dbname, path, con) -> typing.Dict[str, FeatureInfo]:
116 |         features = dict()
117 |         sql_tables = "SELECT tbl_name FROM sqlite_master WHERE type = 'table'"
118 |         tables = [tab for (tab,) in con.execute(sql_tables).fetchall() if not tab.startswith("_")]
119 |         for table in tables:
120 |             columns = con.execute("PRAGMA table_info({})".format(table)).fetchall()
121 |             for index, colname, coltype, notnull, default_value, pk in columns:
122 |                 is_fk_column = table == "features" and colname in tables
123 |                 is_fk_hash = table != "features" and colname == "hash"
124 |                 if not is_fk_column and not is_fk_hash:
125 |                     fname = colname if table == "features" else table
126 |                     dval = default_value.strip('"') if default_value else None
127 |                     features[fname] = FeatureInfo(fname, dbname, table, colname, dval)
128 |         return features
129 | 
130 |     @classmethod
131 |     def context_from_csv(cls, path):
132 |         return cls.context_from_name(Schema.dbname_from_path(path))
133 | 
134 |     @classmethod
135 |     def context_from_database(cls, path):
136 |         # TODO: store context in database
137 |         return cls.context_from_name(Schema.dbname_from_path(path))
138 | 
139 |     @classmethod
140 |     def context_from_name(cls, name):
141 |         pair = name.split("_")
142 |         if len(pair) > 1 and pair[0] in contexts.contexts():
143 |             return pair[0]
144 |         else:
145 |             return contexts.default_context()
146 | 
147 |     @classmethod
148 |     def dbname_from_path(cls, path):
149 |         filename = os.path.splitext(os.path.basename(path))[0]
150 |         if filename[0].isdigit():
151 |             filename = contexts.default_context() + "_" + filename
152 |         return re.sub("[^a-zA-Z0-9]", "_", filename)
153 | 
154 |     @classmethod
155 |     def valid_feature_or_raise(cls, name):
156 |         if not re.match("[a-zA-Z][a-zA-Z0-9_]*", name):
157 |             raise SchemaException("Feature name '{}' must be alphanumeric (incl. underline) and start with a letter.".format(name))
158 |         # gbd_keywords = [ 'hash', 'value', 'local', 'filename', 'features' ]
159 |         gbd_keywords = ["hash", "value", "features"]
160 |         if name.lower() in gbd_keywords:
161 |             raise SchemaException("Feature name '{}' is reserved.".format(name))
162 |         sqlite_keywords = [
163 |             "abort",
164 |             "action",
165 |             "add",
166 |             "after",
167 |             "all",
168 |             "alter",
169 |             "always",
170 |             "analyze",
171 |             "and",
172 |             "as",
173 |             "asc",
174 |             "attach",
175 |             "autoincrement",
176 |             "before",
177 |             "begin",
178 |             "between",
179 |             "by",
180 |             "cascade",
181 |             "case",
182 |             "cast",
183 |             "check",
184 |             "collate",
185 |             "column",
186 |             "commit",
187 |             "conflict",
188 |             "constraint",
189 |             "create",
190 |             "cross",
191 |             "current",
192 |             "current_date",
193 |             "current_time",
194 |             "current_timestamp",
195 |             "database",
196 |             "default",
197 |             "deferrable",
198 |             "deferred",
199 |             "delete",
200 |             "desc",
201 |             "detach",
202 |             "distinct",
203 |             "do",
204 |             "drop",
205 |             "each",
206 |             "else",
207 |             "end",
208 |             "escape",
209 |             "except",
210 |             "exclude",
211 |             "exclusive",
212 |             "exists",
213 |             "explain",
214 |             "fail",
215 |             "filter",
216 |             "first",
217 |             "following",
218 |             "for",
219 |             "foreign",
220 |             "from",
221 |             "full",
222 |             "generated",
223 |             "glob",
224 |             "group",
225 |             "groups",
226 |             "having",
227 |             "if",
228 |             "ignore",
229 |             "immediate",
230 |             "in",
231 |             "index",
232 |             "indexed",
233 |             "initially",
234 |             "inner",
235 |             "insert",
236 |             "instead",
237 |             "intersect",
238 |             "into",
239 |             "is",
240 |             "isnull",
241 |             "join",
242 |             "key",
243 |             "last",
244 |             "left",
245 |             "like",
246 |             "limit",
247 |             "match",
248 |             "materialized",
249 |             "natural",
250 |             "no",
251 |             "not",
252 |             "nothing",
253 |             "notnull",
254 |             "null",
255 |             "nulls",
256 |             "of",
257 |             "offset",
258 |             "on",
259 |             "or",
260 |             "order",
261 |             "others",
262 |             "outer",
263 |             "over",
264 |             "partition",
265 |             "plan",
266 |             "pragma",
267 |             "preceding",
268 |             "primary",
269 |             "query",
270 |             "raise",
271 |             "range",
272 |             "recursive",
273 |             "references",
274 |             "regexp",
275 |             "reindex",
276 |             "release",
277 |             "rename",
278 |             "replace",
279 |             "restrict",
280 |             "returning",
281 |             "right",
282 |             "rollback",
283 |             "row",
284 |             "rows",
285 |             "savepoint",
286 |             "select",
287 |             "set",
288 |             "table",
289 |             "temp",
290 |             "temporary",
291 |             "then",
292 |             "ties",
293 |             "to",
294 |             "transaction",
295 |             "trigger",
296 |             "unbounded",
297 |             "union",
298 |             "unique",
299 |             "update",
300 |             "using",
301 |             "vacuum",
302 |             "values",
303 |             "view",
304 |             "virtual",
305 |             "when",
306 |             "where",
307 |             "window",
308 |             "with",
309 |             "without",
310 |         ]
311 |         if name.lower() in sqlite_keywords or name.startswith("sqlite_"):
312 |             raise SchemaException("Feature name '{}' is reserved by sqlite.".format(name))
313 | 
314 |     def is_in_memory(self):
315 |         return self.csv
316 | 
317 |     def get_connection(self):
318 |         if self.is_in_memory():
319 |             return sqlite3.connect("file::memory:?cache=shared", uri=True)
320 |         else:
321 |             return sqlite3.connect(self.path)
322 | 
323 |     def execute(self, sql):
324 |         con = self.get_connection()
325 |         cur = con.cursor()
326 |         cur.execute(sql)
327 |         con.commit()
328 |         con.close()
329 | 
330 |     def get_tables(self):
331 |         return list(set([f.table for f in self.get_features()]))
332 | 
333 |     def get_features(self):
334 |         return self.features.values()
335 | 
336 |     def has_feature(self, name):
337 |         return name in self.features.keys()
338 | 
339 |     def absorb(self, schema):
340 |         if self.is_in_memory() and schema.is_in_memory():
341 |             self.features.update(schema.features)
342 |         else:
343 |             raise SchemaException("Internal Error: Attempt to merge non-virtual schemata")
344 | 
345 |     def create_main_table_if_not_exists(self):
346 |         main_table = "features"
347 |         if not main_table in self.get_tables():
348 |             self.execute("CREATE TABLE IF NOT EXISTS {} (hash UNIQUE NOT NULL)".format(main_table))
349 |             # insert all known hashes into main table and create triggers
350 |             for table in [t for t in self.get_tables() if t != main_table]:
351 |                 self.execute("INSERT OR IGNORE INTO {} (hash) SELECT DISTINCT(hash) FROM {}".format(main_table, table))
352 |                 self.execute(
353 |                     """CREATE TRIGGER IF NOT EXISTS {}_dval AFTER INSERT ON {} 
354 |                                             BEGIN INSERT OR IGNORE INTO {} (hash) VALUES (NEW.hash); END""".format(table, table, main_table)
355 |                 )
356 |             self.features["hash"] = FeatureInfo("hash", self.dbname, main_table, "hash", None)
357 |             return [self.features["hash"]]
358 |         else:
359 |             return []
360 | 
361 |     def create_feature(self, name, default_value=None, permissive=False):
362 |         if not permissive:  # internal use can be unchecked, e.g., to create the reserved features during initialization
363 |             Schema.valid_feature_or_raise(name)
364 | 
365 |         created = []
366 | 
367 |         if not self.has_feature(name):
368 |             # ensure existence of main table:
369 |             created.extend(self.create_main_table_if_not_exists())
370 | 
371 |             # create new feature:
372 |             main_table = "features"
373 |             self.execute("ALTER TABLE {} ADD {} TEXT NOT NULL DEFAULT {}".format(main_table, name, default_value or "None"))
374 |             if default_value is not None:
375 |                 # feature is unique and resides in main features-table:
376 |                 self.features[name] = FeatureInfo(name, self.dbname, main_table, name, default_value)
377 |             else:
378 |                 # feature is not unique and resides in a separate table (column in main features-table is a foreign key):
379 |                 self.execute("CREATE TABLE IF NOT EXISTS {} (hash TEXT NOT NULL, value TEXT NOT NULL, CONSTRAINT all_unique UNIQUE(hash, value))".format(name))
380 |                 self.execute("INSERT INTO {} (hash, value) VALUES ('None', 'None')".format(name))
381 |                 self.execute(
382 |                     """CREATE TRIGGER IF NOT EXISTS {}_hash AFTER INSERT ON {}
383 |                                     BEGIN INSERT OR IGNORE INTO {} (hash) VALUES (NEW.hash); END""".format(name, name, main_table)
384 |                 )
385 |                 self.features[name] = FeatureInfo(name, self.dbname, name, "value", None)
386 | 
387 |             # update schema:
388 |             created.append(self.features[name])
389 | 
390 |         elif not permissive:
391 |             raise SchemaException("Feature '{}' already exists".format(name))
392 | 
393 |         return created
394 | 
395 |     def set_values(self, feature, value, hashes):
396 |         if not self.has_feature(feature):
397 |             raise SchemaException("Feature '{}' does not exist".format(feature))
398 |         if not len(hashes):
399 |             raise SchemaException("No hashes given")
400 |         table = self.features[feature].table
401 |         column = self.features[feature].column
402 |         values = ", ".join(["('{}', '{}')".format(hash, value) for hash in hashes])
403 |         if self.features[feature].default is None:
404 |             self.execute("INSERT OR IGNORE INTO {tab} (hash, {col}) VALUES {vals}".format(tab=table, col=column, vals=values))
405 |             self.execute("UPDATE features SET {col}=hash WHERE hash in ('{h}')".format(col=table, h="', '".join(hashes)))
406 |         else:
407 |             self.execute(
408 |                 "INSERT INTO {tab} (hash, {col}) VALUES {vals} ON CONFLICT (hash) DO UPDATE SET {col}='{val}' WHERE hash in ('{h}')".format(
409 |                     tab=table, col=column, val=value, vals=values, h="', '".join(hashes)
410 |                 )
411 |             )
412 | 


--------------------------------------------------------------------------------