├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── blank-issue.md │ ├── bug-report.md │ ├── feature-request.md │ └── question.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── cffconvert.yml │ ├── python-publish.yml │ └── pythonapp.yml ├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE ├── README.md ├── config └── example.yaml ├── docs ├── README.md ├── figures.py ├── guides │ └── tokens.md ├── issue-thread.png ├── pdfs │ ├── git_commit_model_add.pdf │ ├── git_commit_model_del.pdf │ ├── git_commit_model_mod.pdf │ ├── gitlab_commit_model.pdf │ ├── gitlab_issue_model.pdf │ ├── gitlab_merge_request_model.pdf │ └── gitlab_release_tag_model.pdf ├── requirements.txt └── svgs │ ├── git_commit_model_add.svg │ ├── git_commit_model_del.svg │ ├── git_commit_model_mod.svg │ ├── gitlab_commit_model.svg │ ├── gitlab_issue_model.svg │ ├── gitlab_merge_request_model.svg │ └── gitlab_release_tag_model.svg ├── gitlab2prov ├── __init__.py ├── adapters │ ├── __init__.py │ ├── fetch │ │ ├── __init__.py │ │ ├── annotations │ │ │ ├── __init__.py │ │ │ ├── classifiers.py │ │ │ └── parse.py │ │ ├── git.py │ │ ├── gitlab.py │ │ └── utils.py │ └── repository.py ├── bootstrap.py ├── config │ ├── __init__.py │ ├── parser.py │ └── schema.json ├── domain │ ├── __init__.py │ ├── commands.py │ ├── constants.py │ └── objects.py ├── entrypoints │ ├── __init__.py │ └── cli.py ├── log.py ├── prov │ ├── __init__.py │ ├── model.py │ └── operations.py ├── root.py └── service_layer │ ├── __init__.py │ ├── handlers.py │ ├── messagebus.py │ └── unit_of_work.py ├── pyproject.toml └── tests ├── __init__.py ├── e2e └── __init__.py ├── integration ├── __init__.py └── test_repository.py ├── random_refs.py └── unit ├── __init__.py ├── test_annotation_parsing.py ├── test_classifiers.py ├── test_fetch_utils.py ├── test_handlers.py ├── test_objects.py └── test_operations.py /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to 
contribute 2 | 3 | 4 | ## Styleguides 5 | ### Commit messages 6 | 7 | Read the two excellent documents on commit messages 8 | * [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/) by [Chris Beams](https://chris.beams.io/) 9 | * [Commit Message Guidelines](https://gist.github.com/robertpainsi/b632364184e70900af4ab688decf6f53) by [robertpainsi](https://gist.github.com/robertpainsi) 10 | 11 | > "If applied, this commit will *\<your subject line here\>*" 12 | 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Blank issue 📄" 3 | about: Create a blank issue. 4 | title: 5 | labels: 6 | assignees: cdboer 7 | 8 | --- -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Bug report \U0001F41B" 3 | about: Create a report to help us improve. 4 | title: 5 | labels: bug 6 | assignees: cdboer 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Use the following config `...` 16 | 2. Run `gitlab2prov` 17 | 3. See error 18 | 19 | **Expected behavior** 20 | A clear and concise description of what you expected to happen. 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 24 | 25 | **Installation details** 26 | Add context about your system and `gitlab2prov` installation.
27 | Version: `gitlab2prov --version` 28 | Operating system: Windows | Linux | macOS 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request ✨ 3 | about: Suggest an idea for this project. 4 | title: 5 | labels: feature 6 | assignees: cdboer 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question ❓ 3 | about: Ask a question about the project. 
4 | title: 5 | labels: question 6 | assignees: cdboer 7 | 8 | --- 9 | 10 | **Your Question** 11 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Summary 📝 2 | Fixes # 3 | 4 | ### Proposed Changes 👷 5 | - 6 | - 7 | - 8 | 9 | ### Type of Change 🏷️ 10 | - [ ] Bug fix (non breaking change which fixes an issue) 11 | - [ ] New feature (non breaking change which adds functionality) 12 | - [ ] Breaking change (fix or feature that could cause existing functionality to not work as expected) 13 | 14 | ### Checklist ✅ 15 | - [ ] I have included tests, if necessary 16 | - [ ] I have updated documentation, if necessary 17 | -------------------------------------------------------------------------------- /.github/workflows/cffconvert.yml: -------------------------------------------------------------------------------- 1 | name: cffconvert 2 | 3 | on: 4 | push: 5 | paths: 6 | - CITATION.cff 7 | 8 | jobs: 9 | validate: 10 | name: "validate" 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out a copy of the repository 14 | uses: actions/checkout@v3 15 | 16 | - name: Check whether the citation metadata from CITATION.cff is valid 17 | uses: citation-file-format/cffconvert-github-action@2.0.0 18 | with: 19 | args: "--validate" -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | 
steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.10' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build twine 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | env: 29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 31 | run: twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: Python Application Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.10 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: "3.10" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install .[dev] 20 | - name: Test with pytest 21 | run: pytest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode 3 | .DS_Store 4 | dist/ 5 | __pycache__/ 6 | .mypy_cache/ 7 | *.egg-info 8 | *.code-workspace 9 | config/ 10 | prov.* 11 | *.provn 12 | build 13 | notebooks/.ipynb_checkpoints 14 | .ipynb_checkpoints 15 | .idea 16 | *.log 17 | *.db 18 | .pytest_cache 19 | venv 20 | *.env -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) 7 | 8 | ## [Unreleased] 9 | 10 | ## [2.2.0] - 2023-09-05 11 | ### Changed 12 | - Switched from `bump2version` to the maintained fork `bump-my-version`. See #95 for more details. Thanks to [@MattF-NSIDC](https://github.com/MattF-NSIDC) for the suggestion! 13 | 14 | ### Removed 15 | - Removed .bumpversion.cfg in favor of configuring `bump-my-version` in `pyproject.toml`. See #95 for more details. 16 | 17 | ### Fixed 18 | - Fixed a bug in the file version history parser that would lead to a `IndexError` when multiple changes were made to files in a single commit. See #102 for more details. 19 | 20 | ## [2.1.0] - 2023-04-17 21 | 22 | ### Added 23 | - CHANGELOG.md file to document all notable changes to this project. 24 | - README.md section on how to get started with gitlab2prov. 25 | - README.md note that gitlab2prov requires a git installation. 26 | - README.md section on dependencies and their respective licenses. See #91 for more details. 27 | - pyproject.toml comments documenting the license information of all dependencies. See #91 for more details. 28 | - Project documentation on how to obtain a GitLab API token. 29 | 30 | ### Changed 31 | - Tool will check for a git installation and exit with an error message if none is found. See #93 for more details. 32 | 33 | ### Fixed 34 | - Click package is now listed as a dependency in pyproject.toml. Thank you [@daniel-mohr](https://github.com/daniel-mohr) for reporting this oversight. 35 | 36 | 37 | ## [2.0.0] - 2022-10-31 38 | ### Added 39 | - New command line interface composed of multiple subcommands that can be chained together to build any desired gitlab2prov pipeline. 40 | - New YAML configuration file format to specify gitlab2prov pipelines in textual format and save them for later reruns or simple on-the-fly edits. 
41 | - JSON-Schema file to check the config file for syntactical errors. 42 | - `bumpversion` support to change the version number with a single command. 43 | 44 | ### Changed 45 | - New build system that minimizes the number of necessary metadata files. 46 | - New CLI replaces the old CLI. 47 | - New config file format replaces the old config file format. 48 | - Deprecate setup.py, setup.cfg in favor of pyproject.toml. See #70 for more details. 49 | 50 | ### Fixed 51 | - Tool runs on Windows should no longer trigger WinError 5/267. See #67. 52 | - Special characters such as emojis no longer trigger a UnicodeEncodeError when serializing files. See #66. 53 | 54 | 55 | ## [1.1.4] - 2022-07-18 56 | 57 | ### Added 58 | - A GitHub workflow to validate the citation file on changes. 59 | 60 | ### Changed 61 | - Current version and release date in citation file. 62 | 63 | ### Fixed 64 | - Update the start- and endpoint of relations during pseudonymization. 65 | - User details and agent IDs are pseudonymized using hashing for flexible pseudonymization and merging of provenance graphs (#62). 66 | 67 | 68 | ## [1.1.3] - 2022-07-03 69 | 70 | ### Fixed 71 | - FileRevision objects that do not have a previous revision no longer lead to a TypeError when serializing the modification model. (see #64) 72 | 73 | ## [1.1.2] - 2022-07-01 74 | 75 | ### Fixed 76 | - Disabled project features no longer lead to a 403 Forbidden. (see #61) 77 | - Default annotation classifier no longer keys into a dictionary resulting in a KeyError (#63) 78 | 79 | ## [1.1.1] - 2022-06-27 80 | 81 | ### Fixed 82 | - Fix a memory bloating issue by not using mutable objects as function argument default value (see #60) 83 | - Fix a spelling mistake in `config.formats` to `config.format` 84 | 85 | ## [1.1] - 2022-06-12 86 | 87 | ### Added 88 | - Multi-format support for provenance graph serialization supported by the `multi-format` subcommand.
(see #54) 89 | - `outfile` option to section `OUTPUT` in the config file to specify the output file name. 90 | 91 | ### Changed 92 | - README now includes a section about multi-format serialization. 93 | 94 | ## [1.0] - 2022-06-12 95 | 96 | ### Added 97 | - Qualified relations for relationship types `wasGeneratedBy`, `wasInvalidatedBy`, `used`, `wasAssociatedWith` (see #52) 98 | - Command line flag `--v` enables logging to the console. 99 | - Command line flag `--profile` enables profiling of a tool run. 100 | - Test suite. 101 | - Documentation for config file options in the config file example. 102 | - Model documentation including descriptions of each node and relationship with their respective properties. 103 | - `requirements_dev.txt` to install development dependencies. 104 | 105 | ### Fixed 106 | - Releases without evidence no longer lead to IndexError. (see #53) 107 | - Processing parent commits no longer leads to errors. (#37) 108 | - Missing event classifiers are now handled gracefully. (#38) 109 | 110 | ### Changed 111 | - Support comma-separated lists of URLs for config file option `project_urls`. 112 | - Rename `aliases` to `double_agents` in the config file and command line interface. 113 | - Rename `pseudonymize` to `pseudonymous` in the config file and command line interface. 114 | - Rename the config file sections to match the new CLI subcommands 115 | - Stop manual file history computation. Each file revision now has exactly one previous revision instead of one or more. 116 | - Use the modification model for commit statuses: MODIFIED, RENAMED, COPIED, CHANGED. 117 | - Ignore file revisions with unknown commit status. 118 | - Change event classification to use the package `regex` that supports branch reset expressions. 119 | - Change build process to use `pyproject.toml` as well as `setup.cfg` 120 | - Change architecture to messagebus in anticipation of future features.
121 | - Bump minimum required `prov` version to 2.0.0 122 | 123 | ### Removed 124 | - Rate limit flag/option `--r`/`rate_limit`. Asynchronicity during retrieval is no longer supported. 125 | - Quiet flag/option `--q`/`quiet`. Default behavior is to be quiet by default unless the `--v` flag is set. 126 | 127 | 128 | ## [0.5] - 2021-06-21 129 | 130 | ### Added 131 | - Support for 'tags' and 'releases' to PROV model (see #39) 132 | - CITATION file in Citation File Format (CFF) (see #49) 133 | 134 | ### Changed 135 | - Revisions (changes) of files expressed in PROV model (see #40) 136 | 137 | ### Fixed 138 | - API client pagination (see #41) 139 | - 'wasGeneratedBy' relation for 'tags' corrected (see #51) 140 | 141 | ### Removed 142 | - Resource names from attributes (see #47, #48) 143 | 144 | ## [0.4.1] - 2021-04-23 145 | ### Added 146 | - Support for wider range of GitLab events. 147 | - Documentation for all new supported events. 148 | 149 | ### Fixed 150 | - Errors when requesting deactivated API endpoints are now handled gracefully. 151 | 152 | ## [0.4] Pre-release - 2021-04-23 153 | ### Added 154 | - Support for additional GitLab events. 155 | 156 | ### Fixed 157 | - Error handling hotfix for requests to deactivated API endpoints. 158 | 159 | ## [0.3] - 2020-10-15 160 | 161 | ### Changed 162 | - Updated PROV models in /docs to reflect changes in the PROV model. 163 | - Updated config file example. 164 | - Update usage section in README. 165 | 166 | ### Fixed 167 | - Updated `setup.py` to resolve an issue for python versions below 3.8. 168 | - Configuration errors are now handled gracefully. 169 | 170 | ### Removed 171 | - Dependency on `prov-db-connector`. Neo4j import functionality is no longer part of this package. 172 | 173 | ## [0.2] - 2020-08-01 174 | 175 | ### Changed 176 | - Project status marked for `gitlab2prov` usage in [@cdboer's](https://github.com/cdboer) bachelor thesis. 
177 | 178 | ## [0.1] - 2020-01-22 179 | 180 | ### Added 181 | - Revised README to provide a comprehensive overview of the project's objectives and instructions for usage. 182 | - Initial public version of the gitlab2prov package. 183 | - Preset queries tailored for use on the property graphs produced by the tool, implemented in Neo4j. 184 | - `requirements.txt` file specifying the list of dependencies required to run the tool. 185 | 186 | [unreleased]: https://github.com/dlr-sc/gitlab2prov/compare/v2.2.0...HEAD 187 | [2.2.0]: https://github.com/dlr-sc/gitlab2prov/compare/v2.1.0...v2.2.0 188 | [2.1.0]: https://github.com/dlr-sc/gitlab2prov/compare/v2.0.0...v2.1.0 189 | [2.0.0]: https://github.com/dlr-sc/gitlab2prov/compare/v1.1.4...v2.0.0 190 | [1.1.4]: https://github.com/dlr-sc/gitlab2prov/compare/v1.1.3...v1.1.4 191 | [1.1.3]: https://github.com/dlr-sc/gitlab2prov/compare/v1.1.2...v1.1.3 192 | [1.1.2]: https://github.com/dlr-sc/gitlab2prov/compare/v1.1.1...v1.1.2 193 | [1.1.1]: https://github.com/dlr-sc/gitlab2prov/compare/v1.1...v1.1.1 194 | [1.1]: https://github.com/dlr-sc/gitlab2prov/compare/v1.0...v1.1 195 | [1.0]: https://github.com/dlr-sc/gitlab2prov/compare/v0.5...v1.0 196 | [0.5]: https://github.com/dlr-sc/gitlab2prov/compare/v0.4.1...v0.5 197 | [0.4.1]: https://github.com/dlr-sc/gitlab2prov/compare/v0.4...v0.4.1 198 | [0.4]: https://github.com/dlr-sc/gitlab2prov/compare/v0.3...v0.4 199 | [0.3]: https://github.com/dlr-sc/gitlab2prov/compare/v0.2...v0.3 200 | [0.2]: https://github.com/dlr-sc/gitlab2prov/compare/v0.1...v0.2 201 | [0.1]: https://github.com/dlr-sc/gitlab2prov/releases/tag/v0.1 -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use GitLab2PROV in your research, please cite it using these metadata." 
3 | title: GitLab2PROV 4 | abstract: "gitlab2prov is a Python library and command line tool that extracts provenance information from GitLab projects." 5 | type: software 6 | authors: 7 | - family-names: "de Boer" 8 | given-names: Claas 9 | affiliation: "German Aerospace Center (DLR)" 10 | orcid: "https://orcid.org/0000-0002-1841-8099" 11 | - family-names: Schreiber 12 | given-names: Andreas 13 | affiliation: "German Aerospace Center (DLR)" 14 | orcid: "https://orcid.org/0000-0001-5750-5649" 15 | version: "2.2.0" 16 | date-released: "2023-09-05" 17 | license: MIT 18 | repository-code: "https://github.com/DLR-SC/gitlab2prov" 19 | references: 20 | - authors: 21 | - family-names: Schreiber 22 | given-names: Andreas 23 | affiliation: "German Aerospace Center (DLR)" 24 | orcid: "https://orcid.org/0000-0001-5750-5649" 25 | - family-names: "de Boer" 26 | given-names: Claas 27 | affiliation: "German Aerospace Center (DLR)" 28 | orcid: "https://orcid.org/0000-0002-1841-8099" 29 | - family-names: "von Kurnatowski" 30 | given-names: Lynn 31 | affiliation: "German Aerospace Center (DLR)" 32 | orcid: "https://orcid.org/0000-0001-5144-702X" 33 | title: "GitLab2PROV—Provenance of Software Projects hosted on GitLab" 34 | type: conference-paper 35 | conference: 36 | name: "13th International Workshop on Theory and Practice of Provenance (TaPP 2021)" 37 | year: 2021 38 | month: 7 39 | keywords: 40 | - provenance 41 | - git 42 | - "software-analytics" 43 | - gitlab 44 | - "w3c-prov" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 German Aerospace Center (DLR) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Welcome to gitlab2prov! 👋

2 |

3 | 4 | License: MIT 5 | 6 | 7 | Badge: Made with Python 8 | 9 | 10 | Badge: PyPi Version 11 | 12 | 13 | Badge: PyPi Downloads Monthly 14 | 15 | 16 | Twitter: DLR Software 17 | 18 | 19 | Badge: Open in VSCode 20 | 21 | 22 | Badge: DOI 23 | 24 | 25 | Badge: W3C PROV 26 | 27 | 28 | Badge: Citation File Format Inside 29 | 30 |

31 | 32 | 33 | > `gitlab2prov` is a Python library and command line tool that extracts provenance information from GitLab projects. 34 | 35 | --- 36 | 37 | The `gitlab2prov` data model has been designed according to the [W3C PROV](https://www.w3.org/TR/prov-overview/) specification. 38 | The model documentation can be found [here](https://github.com/DLR-SC/gitlab2prov/tree/master/docs). 39 | 40 | ## 🏗️ Installation 41 | 42 | Please note that this tool requires Git to be installed on your machine. 43 | 44 | Clone the project and install using `pip`: 45 | ```bash 46 | pip install . 47 | ``` 48 | 49 | Or install the latest release from [PyPi](https://pypi.org/project/gitlab2prov/): 50 | ```bash 51 | pip install gitlab2prov 52 | ``` 53 | 54 | To install `gitlab2prov` with all extra dependencies, request the `[dev]` extras: 55 | ```bash 56 | pip install .[dev] # clone repo, install with extras 57 | pip install gitlab2prov[dev] # PyPi, install with extras 58 | ``` 59 | 60 | ## ⚡ Getting started 61 | 62 | `gitlab2prov` needs a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab API. 63 | Follow [this guide](./docs/guides/tokens.md) to create an access token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes). 64 | 65 | 66 | ## 🚀 Usage 67 | 68 | `gitlab2prov` can be configured using the command line interface or by providing a configuration file in `.yaml` format. 69 | 70 | ### Command Line Usage 71 | The command line interface consists of commands that can be chained together like a unix pipeline. 72 | 73 | ``` 74 | Usage: gitlab2prov [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 [ARGS]...]... 75 | 76 | Extract provenance information from GitLab projects. 77 | 78 | Options: 79 | --version Show the version and exit. 80 | --verbose Enable logging to 'gitlab2prov.log'. 81 | --config FILE Read config from file.
82 | --validate FILE Validate config file and exit. 83 | --help Show this message and exit. 84 | 85 | Commands: 86 | combine Combine multiple graphs into one. 87 | extract Extract provenance information for one or more... 88 | load Load provenance files. 89 | merge-duplicated-agents Merge duplicated agents based on a name to... 90 | pseudonymize Pseudonymize a provenance graph. 91 | save Save provenance information to a file. 92 | stats Print statistics such as node counts and... 93 | ``` 94 | 95 | ### Configuration Files 96 | `gitlab2prov` supports configuration files in `.yaml` format that are functionally equivalent to command line invocations. 97 | 98 | To read configuration details from a file instead of specifying them on the command line, use the `--config` option: 99 | ```ini 100 | # initiate a run using a config file 101 | gitlab2prov --config config/example.yaml 102 | ``` 103 | You can validate your config file using the provided JSON-Schema `gitlab2prov/config/schema.json` that comes packaged with every installation: 104 | ```ini 105 | # check config file for syntactical errors 106 | gitlab2prov --validate config/example.yaml 107 | ``` 108 | 109 | Config file example: 110 | 111 | ```yaml 112 | - extract: 113 | url: ["https://gitlab.com/example/foo"] 114 | token: tokenFoo 115 | - extract: 116 | url: ["https://gitlab.com/example/bar"] 117 | token: tokenBar 118 | - load: 119 | input: [example.rdf] 120 | - pseudonymize: 121 | - combine: 122 | - save: 123 | output: combined 124 | format: [json, rdf, xml, dot] 125 | - stats: 126 | fine: true 127 | explain: true 128 | formatter: table 129 | ``` 130 | 131 | The config file example is functionally equivalent to this command line invocation: 132 | 133 | ``` 134 | gitlab2prov extract -u https://gitlab.com/example/foo -t tokenFoo \ 135 | extract -u https://gitlab.com/example/bar -t tokenBar \ 136 | load -i example.rdf \ 137 | pseudonymize \ 138 | combine \ 139 | save -o combined -f json -f rdf -f xml -f dot \ 140 | stats
--fine --explain --formatter table 141 | ``` 142 | 143 | ### 🎨 Provenance Output Formats 144 | 145 | `gitlab2prov` supports output formats that the [`prov`](https://github.com/trungdong/prov) library provides: 146 | * [PROV-N](http://www.w3.org/TR/prov-n/) 147 | * [PROV-O](http://www.w3.org/TR/prov-o/) (RDF) 148 | * [PROV-XML](http://www.w3.org/TR/prov-xml/) 149 | * [PROV-JSON](http://www.w3.org/Submission/prov-json/) 150 | * [Graphviz](https://graphviz.org/) (DOT) 151 | 152 | ## 🤝 Contributing 153 | 154 | Contributions and pull requests are welcome! 155 | For major changes, please open an issue first to discuss what you would like to change. 156 | 157 | ## ✨ How to cite 158 | 159 | If you use GitLab2PROV in a scientific publication, we would appreciate citations to the following paper: 160 | 161 | * Schreiber, A., de Boer, C. and von Kurnatowski, L. (2021). [GitLab2PROV—Provenance of Software Projects hosted on GitLab](https://www.usenix.org/conference/tapp2021/presentation/schreiber). 13th International Workshop on Theory and Practice of Provenance (TaPP 2021), USENIX Association 162 | 163 | BibTeX entry: 164 | 165 | ```BibTeX 166 | @InProceedings{SchreiberBoerKurnatowski2021, 167 | author = {Andreas Schreiber and Claas de~Boer and Lynn von~Kurnatowski}, 168 | booktitle = {13th International Workshop on Theory and Practice of Provenance (TaPP 2021)}, 169 | title = {{GitLab2PROV}{\textemdash}Provenance of Software Projects hosted on GitLab}, 170 | year = {2021}, 171 | month = jul, 172 | publisher = {{USENIX} Association}, 173 | url = {https://www.usenix.org/conference/tapp2021/presentation/schreiber}, 174 | } 175 | ``` 176 | 177 | You can also cite specific releases published on Zenodo: [![DOI](https://zenodo.org/badge/215042878.svg)](https://zenodo.org/badge/latestdoi/215042878) 178 | 179 | ## ✏️ References 180 | 181 | **Influential Software for `gitlab2prov`** 182 | * Martin Stoffers: "Gitlab2Graph", v1.0.0, October 13.
2019, [GitHub Link](https://github.com/DLR-SC/Gitlab2Graph), DOI 10.5281/zenodo.3469385 183 | 184 | * Quentin Pradet: "How do you rate limit calls with aiohttp?", [GitHub Gist](https://gist.github.com/pquentin/5d8f5408cdad73e589d85ba509091741), MIT LICENSE 185 | 186 | **Influential Papers for `gitlab2prov`**: 187 | 188 | * De Nies, T., Magliacane, S., Verborgh, R., Coppens, S., Groth, P., Mannens, E., and Van de Walle, R. (2013). [Git2PROV: Exposing Version Control System Content as W3C PROV](https://dl.acm.org/doi/abs/10.5555/2874399.2874431). In *Poster and Demo Proceedings of the 12th International Semantic Web Conference* (Vol. 1035, pp. 125–128). 189 | 190 | * Packer, H. S., Chapman, A., and Carr, L. (2019). [GitHub2PROV: provenance for supporting software project management](https://dl.acm.org/doi/10.5555/3359032.3359039). In *11th International Workshop on Theory and Practice of Provenance (TaPP 2019)*. 191 | 192 | **Papers that refer to `gitlab2prov`**: 193 | 194 | * Andreas Schreiber, Claas de Boer (2020). [Modelling Knowledge about Software Processes using Provenance Graphs and its Application to Git-based Version Control Systems](https://dl.acm.org/doi/10.1145/3387940.3392220). In *ICSEW'20: Proceedings of the IEEE/ACM 42nd Conference on Software Engineering Workshops* (pp. 358–359). 195 | 196 | * Tim Sonnekalb, Thomas S. Heinze, Lynn von Kurnatowski, Andreas Schreiber, Jesus M. Gonzalez-Barahona, and Heather Packer (2020). [Towards automated, provenance-driven security audit for git-based repositories: applied to germany's corona-warn-app: vision paper](https://doi.org/10.1145/3416507.3423190). In *Proceedings of the 3rd ACM SIGSOFT International Workshop on Software Security from Design to Deployment* (pp. 15–18). 197 | 198 | * Andreas Schreiber (2020). [Visualization of contributions to open-source projects](https://doi.org/10.1145/3430036.3430057).
In *Proceedings of the 13th International Symposium on Visual Information Communication and Interaction*. ACM, USA. 199 | 200 | ## 📜 Dependencies 201 | `gitlab2prov` depends on several open source packages that are made freely available under their respective licenses. 202 | 203 | | Package | License | 204 | | --------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | 205 | | [GitPython](https://github.com/gitpython-developers/GitPython) | [![License](https://img.shields.io/badge/License-BSD_3--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause) | 206 | | [click](https://github.com/pallets/click) | [![License](https://img.shields.io/badge/License-BSD_3--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause) | 207 | | [python-gitlab](https://github.com/python-gitlab/python-gitlab) | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL_v3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0) | 208 | | [prov](https://pypi.org/project/prov/) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | 209 | | [jsonschema](https://github.com/python-jsonschema/jsonschema) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | 210 | | [ruamel.yaml](https://pypi.org/project/ruamel.yaml/) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | 211 | | [pydot](https://github.com/pydot/pydot) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | 212 | 213 | ## 📝 License 214 | This project is [MIT](https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE) licensed. 215 | Copyright © 2019 German Aerospace Center (DLR) and individual contributors. 
"""PROV model for GitLab2PROV.

Builds one small ``ProvDocument`` per model figure and renders each of them
to a PDF and an SVG file below the directory that contains this script.
"""

__author__ = "Claas de Boer, Andreas Schreiber, Lynn von Kurnatowski"
__copyright__ = "Copyright 2020, German Aerospace Center (DLR) and individual contributors"
__license__ = "MIT"
__version__ = "1.0"
__status__ = "Stable"


from pathlib import Path

from prov.model import ProvDocument
from prov.dot import prov_to_dot


# Git commit model: a file is added.
add = ProvDocument()
add.set_default_namespace("gitlab2prov:")
add.activity("Commit")
add.activity("Parent Commit")
add.agent("Committer")
add.agent("Author")
add.entity("File")
add.entity("File Revision")
add.wasInformedBy("Commit", "Parent Commit")
add.wasAssociatedWith("Commit", "Committer")
add.wasAssociatedWith("Commit", "Author")
add.wasGeneratedBy("File", "Commit")
add.wasGeneratedBy("File Revision", "Commit")
add.wasAttributedTo("File", "Author")
add.wasAttributedTo("File Revision", "Author")
add.specializationOf("File Revision", "File")


# Git commit model: a file is modified.
mod = ProvDocument()
mod.set_default_namespace("gitlab2prov:")
mod.activity("Commit")
mod.activity("Parent Commit")
mod.agent("Committer")
mod.agent("Author")
mod.entity("File")
mod.entity("File Revision")
mod.entity("Previous File Revision")
mod.wasInformedBy("Commit", "Parent Commit")
mod.wasAssociatedWith("Commit", "Author")
mod.wasAssociatedWith("Commit", "Committer")
mod.used("Commit", "Previous File Revision")
mod.wasGeneratedBy("File Revision", "Commit")
mod.wasRevisionOf("File Revision", "Previous File Revision")
mod.specializationOf("File Revision", "File")
mod.specializationOf("Previous File Revision", "File")
mod.wasAttributedTo("File Revision", "Author")


# Git commit model: a file is deleted.
rem = ProvDocument()
rem.set_default_namespace("gitlab2prov:")
rem.activity("Commit")
rem.activity("Parent Commit")
rem.agent("Committer")
rem.agent("Author")
rem.entity("File")
rem.entity("File Revision")
rem.wasInformedBy("Commit", "Parent Commit")
rem.wasAssociatedWith("Commit", "Committer")
rem.wasAssociatedWith("Commit", "Author")
rem.wasInvalidatedBy("File Revision", "Commit")
rem.specializationOf("File Revision", "File")


# GitLab commit model.
com = ProvDocument()
com.set_default_namespace("gitlab2prov:")
com.agent("Gitlab Commit Author")
com.agent("Annotator")
com.activity("Creation")
com.activity("Annotation")
com.activity("Git Commit")
com.wasInformedBy("Creation", "Git Commit")
com.entity("Commit")
com.entity("Commit Version")
com.entity("Annotated Commit Version")
com.wasAssociatedWith("Creation", "Gitlab Commit Author")
com.wasAttributedTo("Commit", "Gitlab Commit Author")
com.wasAttributedTo("Commit Version", "Gitlab Commit Author")
com.wasGeneratedBy("Commit", "Creation")
com.wasGeneratedBy("Commit Version", "Creation")
com.wasAttributedTo("Annotated Commit Version", "Annotator")
com.wasAssociatedWith("Annotation", "Annotator")
com.used("Annotation", "Commit Version")
com.wasInformedBy("Annotation", "Creation")
com.wasGeneratedBy("Annotated Commit Version", "Annotation")
com.specializationOf("Commit Version", "Commit")
com.specializationOf("Annotated Commit Version", "Commit")
com.wasDerivedFrom("Annotated Commit Version", "Commit Version")


# GitLab merge request model.
mr = ProvDocument()
mr.set_default_namespace("gitlab2prov:")
mr.agent("Merge Request Author")
mr.agent("Annotator")
mr.activity("Creation")
mr.activity("Annotation")
mr.entity("Merge Request")
mr.entity("Merge Request Version")
mr.entity("Annotated Merge Request Version")
mr.wasInformedBy("Annotation", "Creation")
mr.wasGeneratedBy("Merge Request", "Creation")
mr.wasGeneratedBy("Merge Request Version", "Creation")
mr.wasGeneratedBy("Annotated Merge Request Version", "Annotation")
mr.used("Annotation", "Merge Request Version")
mr.specializationOf("Merge Request Version", "Merge Request")
mr.specializationOf("Annotated Merge Request Version", "Merge Request")
mr.wasDerivedFrom("Annotated Merge Request Version", "Merge Request Version")
mr.wasAttributedTo("Annotated Merge Request Version", "Annotator")
mr.wasAttributedTo("Merge Request Version", "Merge Request Author")
mr.wasAttributedTo("Merge Request", "Merge Request Author")
mr.wasAssociatedWith("Creation", "Merge Request Author")
mr.wasAssociatedWith("Annotation", "Annotator")


# GitLab issue model.
iss = ProvDocument()
iss.set_default_namespace("gitlab2prov:")
iss.agent("Issue Author")
iss.agent("Annotator")
iss.activity("Creation")
iss.activity("Annotation")
iss.entity("Issue")
iss.entity("Issue Version")
iss.entity("Annotated Issue Version")
iss.wasInformedBy("Annotation", "Creation")
iss.wasGeneratedBy("Issue", "Creation")
iss.wasGeneratedBy("Issue Version", "Creation")
iss.wasGeneratedBy("Annotated Issue Version", "Annotation")
iss.used("Annotation", "Issue Version")
iss.specializationOf("Issue Version", "Issue")
iss.specializationOf("Annotated Issue Version", "Issue")
iss.wasDerivedFrom("Annotated Issue Version", "Issue Version")
iss.wasAttributedTo("Annotated Issue Version", "Annotator")
iss.wasAttributedTo("Issue Version", "Issue Author")
iss.wasAttributedTo("Issue", "Issue Author")
iss.wasAssociatedWith("Creation", "Issue Author")
iss.wasAssociatedWith("Annotation", "Annotator")


# GitLab release and tag model.
release_tag_model = ProvDocument()
release_tag_model.set_default_namespace("gitlab2prov:")
release_tag_model.agent("Release Author")
release_tag_model.agent("Tag Author")
release_tag_model.agent("Author")

release_tag_model.activity("Release Creation")
release_tag_model.activity("Tag Creation")
release_tag_model.activity("Commit Creation")
release_tag_model.entity("Tag")
release_tag_model.entity("Release")
release_tag_model.entity("Commit")
release_tag_model.entity("Evidence")
release_tag_model.entity("Asset")
release_tag_model.hadMember("Asset", "Release")
release_tag_model.hadMember("Evidence", "Release")
release_tag_model.hadMember("Tag", "Release")
release_tag_model.hadMember("Commit", "Tag")
release_tag_model.wasAssociatedWith("Commit Creation", "Author")
release_tag_model.wasAssociatedWith("Release Creation", "Release Author")
release_tag_model.wasAssociatedWith("Tag Creation", "Tag Author")
release_tag_model.wasAttributedTo("Release", "Release Author")
release_tag_model.wasAttributedTo("Tag", "Tag Author")
release_tag_model.wasAttributedTo("Commit", "Author")
release_tag_model.wasGeneratedBy("Release", "Release Creation")
release_tag_model.wasGeneratedBy("Tag", "Tag Creation")
release_tag_model.wasGeneratedBy("Commit", "Commit Creation")


# Resolve the output directories relative to this script instead of the
# current working directory, so the figures are written to the same place no
# matter where the script is started from.
DOCS_DIR = Path(__file__).resolve().parent
PDF_DIR = DOCS_DIR / "pdfs"
SVG_DIR = DOCS_DIR / "svgs"
PDF_DIR.mkdir(exist_ok=True)
SVG_DIR.mkdir(exist_ok=True)

for title, doc in [
    ("git_commit_model_add", add),
    ("git_commit_model_mod", mod),
    ("git_commit_model_del", rem),
    ("gitlab_commit_model", com),
    ("gitlab_issue_model", iss),
    ("gitlab_merge_request_model", mr),
    ("gitlab_release_tag_model", release_tag_model),
]:
    # Convert once; the same graph is written in both output formats.
    dot = prov_to_dot(doc, show_nary=False, use_labels=False, direction="BT")
    dot.set_graph_defaults(bgcolor="transparent")
    dot.write_pdf(str(PDF_DIR / f"{title}.pdf"))
    dot.write_svg(str(SVG_DIR / f"{title}.svg"))
Click on Access Tokens 16 | ![Step 4 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/41c81b49-af90-4ff0-afd1-4ed8db7c7ba9/bdb0112f-19fd-4409-96c4-f6f7caec4d66.png?crop=focalpoint&fit=crop&fp-x=0.5000&fp-y=0.5000&fp-z=1.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=1114%3A680) 17 | 18 | 19 | ### 5. Assign a name to your token to remember its purpose. 20 | ![Step 5 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/11f0e0e5-1cb7-4f71-b62a-9433347314f2/4d95c458-e74b-45f8-84ef-18c2812fa347.png?crop=focalpoint&fit=crop&fp-x=0.5525&fp-y=0.2498&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) 21 | 22 | 23 | ### 6. Optionally: Choose an expiration date for your token 24 | 25 | 26 | ### 7. Check read_api… 27 | ![Step 7 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/2d6b09f3-c5f8-4ac0-80ae-012289b4685f/78ab3230-8bf0-42b4-bf69-d5514d3a4570.png?crop=focalpoint&fit=crop&fp-x=0.5384&fp-y=0.3226&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) 28 | 29 | 30 | ### 8. Check read_repository… 31 | ![Step 8 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/a3ccc72a-56a2-4257-8e48-6a00f8764137/d460fe84-10a1-4b76-88a3-b5397896195a.png?crop=focalpoint&fit=crop&fp-x=0.5660&fp-y=0.3966&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) 32 | 33 | 34 | ### 9. 
Click on Create personal access token 35 | ![Step 9 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/c447e522-ab55-413f-af91-ffe35218c7b6/3cefe682-242c-420e-b143-73d7aa344f6c.png?crop=focalpoint&fit=crop&fp-x=0.5502&fp-y=0.4047&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) 36 | 37 | 38 | ### 10. Click on Copy personal access token to copy the token to your clipboard 39 | ![Step 10 screenshot](https://images.tango.us/workflows/3e9025d4-5926-4af0-9359-f8ed8ceb0342/steps/4a7fbc56-e9f6-4bf2-8da2-e393e169cfc0/7fe39bf9-834c-4759-8e88-311705e967af.png?crop=focalpoint&fit=crop&fp-x=0.5527&fp-y=0.2498&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) 40 | 41 | ### 11. Done! -------------------------------------------------------------------------------- /docs/issue-thread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/issue-thread.png -------------------------------------------------------------------------------- /docs/pdfs/git_commit_model_add.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/git_commit_model_add.pdf -------------------------------------------------------------------------------- /docs/pdfs/git_commit_model_del.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/git_commit_model_del.pdf -------------------------------------------------------------------------------- /docs/pdfs/git_commit_model_mod.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/git_commit_model_mod.pdf -------------------------------------------------------------------------------- /docs/pdfs/gitlab_commit_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/gitlab_commit_model.pdf -------------------------------------------------------------------------------- /docs/pdfs/gitlab_issue_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/gitlab_issue_model.pdf -------------------------------------------------------------------------------- /docs/pdfs/gitlab_merge_request_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/gitlab_merge_request_model.pdf -------------------------------------------------------------------------------- /docs/pdfs/gitlab_release_tag_model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/docs/pdfs/gitlab_release_tag_model.pdf -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | prov 2 | -------------------------------------------------------------------------------- /docs/svgs/git_commit_model_add.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | 
Commit 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Parent Commit 26 | 27 | 28 | 29 | 30 | 31 | n1->n2 32 | 33 | 34 | wasInformedBy 35 | 36 | 37 | 38 | n3 39 | 40 | 41 | Committer 42 | 43 | 44 | 45 | 46 | 47 | n1->n3 48 | 49 | 50 | wasAssociatedWith 51 | 52 | 53 | 54 | n4 55 | 56 | 57 | Author 58 | 59 | 60 | 61 | 62 | 63 | n1->n4 64 | 65 | 66 | wasAssociatedWith 67 | 68 | 69 | 70 | n5 71 | 72 | 73 | File 74 | 75 | 76 | 77 | 78 | 79 | n5->n1 80 | 81 | 82 | wasGeneratedBy 83 | 84 | 85 | 86 | n5->n4 87 | 88 | 89 | wasAttributedTo 90 | 91 | 92 | 93 | n6 94 | 95 | 96 | File Revision 97 | 98 | 99 | 100 | 101 | 102 | n6->n1 103 | 104 | 105 | wasGeneratedBy 106 | 107 | 108 | 109 | n6->n4 110 | 111 | 112 | wasAttributedTo 113 | 114 | 115 | 116 | n6->n5 117 | 118 | 119 | specializationOf 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /docs/svgs/git_commit_model_del.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | Commit 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Parent Commit 26 | 27 | 28 | 29 | 30 | 31 | n1->n2 32 | 33 | 34 | wasInformedBy 35 | 36 | 37 | 38 | n3 39 | 40 | 41 | Committer 42 | 43 | 44 | 45 | 46 | 47 | n1->n3 48 | 49 | 50 | wasAssociatedWith 51 | 52 | 53 | 54 | n4 55 | 56 | 57 | Author 58 | 59 | 60 | 61 | 62 | 63 | n1->n4 64 | 65 | 66 | wasAssociatedWith 67 | 68 | 69 | 70 | n5 71 | 72 | 73 | File 74 | 75 | 76 | 77 | 78 | 79 | n6 80 | 81 | 82 | File Revision 83 | 84 | 85 | 86 | 87 | 88 | n6->n1 89 | 90 | 91 | wasInvalidatedBy 92 | 93 | 94 | 95 | n6->n5 96 | 97 | 98 | specializationOf 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /docs/svgs/git_commit_model_mod.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | Commit 17 | 18 | 19 | 20 | 21 | 
22 | n2 23 | 24 | 25 | Parent Commit 26 | 27 | 28 | 29 | 30 | 31 | n1->n2 32 | 33 | 34 | wasInformedBy 35 | 36 | 37 | 38 | n3 39 | 40 | 41 | Committer 42 | 43 | 44 | 45 | 46 | 47 | n1->n3 48 | 49 | 50 | wasAssociatedWith 51 | 52 | 53 | 54 | n4 55 | 56 | 57 | Author 58 | 59 | 60 | 61 | 62 | 63 | n1->n4 64 | 65 | 66 | wasAssociatedWith 67 | 68 | 69 | 70 | n7 71 | 72 | 73 | Previous File Revision 74 | 75 | 76 | 77 | 78 | 79 | n1->n7 80 | 81 | 82 | used 83 | 84 | 85 | 86 | n5 87 | 88 | 89 | File 90 | 91 | 92 | 93 | 94 | 95 | n6 96 | 97 | 98 | File Revision 99 | 100 | 101 | 102 | 103 | 104 | n6->n1 105 | 106 | 107 | wasGeneratedBy 108 | 109 | 110 | 111 | n6->n4 112 | 113 | 114 | wasAttributedTo 115 | 116 | 117 | 118 | n6->n5 119 | 120 | 121 | specializationOf 122 | 123 | 124 | 125 | b1 126 | 127 | 128 | 129 | 130 | n6->b1 131 | 132 | wasDerivedFrom 133 | 134 | 135 | 136 | n7->n5 137 | 138 | 139 | specializationOf 140 | 141 | 142 | 143 | b1->n7 144 | 145 | 146 | 147 | 148 | 149 | ann1 150 | 151 | 152 | 153 | 154 | prov:type 155 | 156 | 157 | 158 | prov:Revision 159 | 160 | 161 | 162 | 163 | 164 | ann1->b1 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/svgs/gitlab_commit_model.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | Gitlab Commit Author 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Annotator 26 | 27 | 28 | 29 | 30 | 31 | n3 32 | 33 | 34 | Creation 35 | 36 | 37 | 38 | 39 | 40 | n3->n1 41 | 42 | 43 | wasAssociatedWith 44 | 45 | 46 | 47 | n5 48 | 49 | 50 | Git Commit 51 | 52 | 53 | 54 | 55 | 56 | n3->n5 57 | 58 | 59 | wasInformedBy 60 | 61 | 62 | 63 | n4 64 | 65 | 66 | Annotation 67 | 68 | 69 | 70 | 71 | 72 | n4->n2 73 | 74 | 75 | wasAssociatedWith 76 | 77 | 78 | 79 | n4->n3 80 | 81 | 82 | wasInformedBy 83 | 84 | 85 | 86 | n7 87 | 88 | 89 | Commit Version 90 | 91 | 92 | 93 | 94 | 95 | n4->n7 96 | 
97 | 98 | used 99 | 100 | 101 | 102 | n6 103 | 104 | 105 | Commit 106 | 107 | 108 | 109 | 110 | 111 | n6->n1 112 | 113 | 114 | wasAttributedTo 115 | 116 | 117 | 118 | n6->n3 119 | 120 | 121 | wasGeneratedBy 122 | 123 | 124 | 125 | n7->n1 126 | 127 | 128 | wasAttributedTo 129 | 130 | 131 | 132 | n7->n3 133 | 134 | 135 | wasGeneratedBy 136 | 137 | 138 | 139 | n7->n6 140 | 141 | 142 | specializationOf 143 | 144 | 145 | 146 | n8 147 | 148 | 149 | Annotated Commit Version 150 | 151 | 152 | 153 | 154 | 155 | n8->n2 156 | 157 | 158 | wasAttributedTo 159 | 160 | 161 | 162 | n8->n4 163 | 164 | 165 | wasGeneratedBy 166 | 167 | 168 | 169 | n8->n6 170 | 171 | 172 | specializationOf 173 | 174 | 175 | 176 | n8->n7 177 | 178 | 179 | wasDerivedFrom 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /docs/svgs/gitlab_issue_model.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | Issue Author 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Annotator 26 | 27 | 28 | 29 | 30 | 31 | n3 32 | 33 | 34 | Creation 35 | 36 | 37 | 38 | 39 | 40 | n3->n1 41 | 42 | 43 | wasAssociatedWith 44 | 45 | 46 | 47 | n4 48 | 49 | 50 | Annotation 51 | 52 | 53 | 54 | 55 | 56 | n4->n2 57 | 58 | 59 | wasAssociatedWith 60 | 61 | 62 | 63 | n4->n3 64 | 65 | 66 | wasInformedBy 67 | 68 | 69 | 70 | n6 71 | 72 | 73 | Issue Version 74 | 75 | 76 | 77 | 78 | 79 | n4->n6 80 | 81 | 82 | used 83 | 84 | 85 | 86 | n5 87 | 88 | 89 | Issue 90 | 91 | 92 | 93 | 94 | 95 | n5->n1 96 | 97 | 98 | wasAttributedTo 99 | 100 | 101 | 102 | n5->n3 103 | 104 | 105 | wasGeneratedBy 106 | 107 | 108 | 109 | n6->n1 110 | 111 | 112 | wasAttributedTo 113 | 114 | 115 | 116 | n6->n3 117 | 118 | 119 | wasGeneratedBy 120 | 121 | 122 | 123 | n6->n5 124 | 125 | 126 | specializationOf 127 | 128 | 129 | 130 | n7 131 | 132 | 133 | Annotated Issue Version 134 | 135 | 136 | 137 | 138 | 139 | n7->n2 140 | 
141 | 142 | wasAttributedTo 143 | 144 | 145 | 146 | n7->n4 147 | 148 | 149 | wasGeneratedBy 150 | 151 | 152 | 153 | n7->n5 154 | 155 | 156 | specializationOf 157 | 158 | 159 | 160 | n7->n6 161 | 162 | 163 | wasDerivedFrom 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/svgs/gitlab_merge_request_model.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | n1 14 | 15 | 16 | Merge Request Author 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Annotator 26 | 27 | 28 | 29 | 30 | 31 | n3 32 | 33 | 34 | Creation 35 | 36 | 37 | 38 | 39 | 40 | n3->n1 41 | 42 | 43 | wasAssociatedWith 44 | 45 | 46 | 47 | n4 48 | 49 | 50 | Annotation 51 | 52 | 53 | 54 | 55 | 56 | n4->n2 57 | 58 | 59 | wasAssociatedWith 60 | 61 | 62 | 63 | n4->n3 64 | 65 | 66 | wasInformedBy 67 | 68 | 69 | 70 | n6 71 | 72 | 73 | Merge Request Version 74 | 75 | 76 | 77 | 78 | 79 | n4->n6 80 | 81 | 82 | used 83 | 84 | 85 | 86 | n5 87 | 88 | 89 | Merge Request 90 | 91 | 92 | 93 | 94 | 95 | n5->n1 96 | 97 | 98 | wasAttributedTo 99 | 100 | 101 | 102 | n5->n3 103 | 104 | 105 | wasGeneratedBy 106 | 107 | 108 | 109 | n6->n1 110 | 111 | 112 | wasAttributedTo 113 | 114 | 115 | 116 | n6->n3 117 | 118 | 119 | wasGeneratedBy 120 | 121 | 122 | 123 | n6->n5 124 | 125 | 126 | specializationOf 127 | 128 | 129 | 130 | n7 131 | 132 | 133 | Annotated Merge Request Version 134 | 135 | 136 | 137 | 138 | 139 | n7->n2 140 | 141 | 142 | wasAttributedTo 143 | 144 | 145 | 146 | n7->n4 147 | 148 | 149 | wasGeneratedBy 150 | 151 | 152 | 153 | n7->n5 154 | 155 | 156 | specializationOf 157 | 158 | 159 | 160 | n7->n6 161 | 162 | 163 | wasDerivedFrom 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /docs/svgs/gitlab_release_tag_model.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 
| 12 | 13 | n1 14 | 15 | 16 | Release Author 17 | 18 | 19 | 20 | 21 | 22 | n2 23 | 24 | 25 | Tag Author 26 | 27 | 28 | 29 | 30 | 31 | n3 32 | 33 | 34 | Author 35 | 36 | 37 | 38 | 39 | 40 | n4 41 | 42 | 43 | Release Creation 44 | 45 | 46 | 47 | 48 | 49 | n4->n1 50 | 51 | 52 | wasAssociatedWith 53 | 54 | 55 | 56 | n5 57 | 58 | 59 | Tag Creation 60 | 61 | 62 | 63 | 64 | 65 | n5->n2 66 | 67 | 68 | wasAssociatedWith 69 | 70 | 71 | 72 | n6 73 | 74 | 75 | Commit Creation 76 | 77 | 78 | 79 | 80 | 81 | n6->n3 82 | 83 | 84 | wasAssociatedWith 85 | 86 | 87 | 88 | n7 89 | 90 | 91 | Tag 92 | 93 | 94 | 95 | 96 | 97 | n7->n2 98 | 99 | 100 | wasAttributedTo 101 | 102 | 103 | 104 | n7->n5 105 | 106 | 107 | wasGeneratedBy 108 | 109 | 110 | 111 | n8 112 | 113 | 114 | Release 115 | 116 | 117 | 118 | 119 | 120 | n7->n8 121 | 122 | 123 | hadMember 124 | 125 | 126 | 127 | n8->n1 128 | 129 | 130 | wasAttributedTo 131 | 132 | 133 | 134 | n8->n4 135 | 136 | 137 | wasGeneratedBy 138 | 139 | 140 | 141 | n9 142 | 143 | 144 | Commit 145 | 146 | 147 | 148 | 149 | 150 | n9->n3 151 | 152 | 153 | wasAttributedTo 154 | 155 | 156 | 157 | n9->n6 158 | 159 | 160 | wasGeneratedBy 161 | 162 | 163 | 164 | n9->n7 165 | 166 | 167 | hadMember 168 | 169 | 170 | 171 | n10 172 | 173 | 174 | Evidence 175 | 176 | 177 | 178 | 179 | 180 | n10->n8 181 | 182 | 183 | hadMember 184 | 185 | 186 | 187 | n11 188 | 189 | 190 | Asset 191 | 192 | 193 | 194 | 195 | 196 | n11->n8 197 | 198 | 199 | hadMember 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /gitlab2prov/__init__.py: -------------------------------------------------------------------------------- 1 | """Extract provenance from GitLab projects.""" 2 | 3 | __author__ = "Claas de Boer, Andreas Schreiber" 4 | __copyright__ = "Copyright 2020, German Aerospace Center (DLR) and individual contributors" 5 | __license__ = "MIT" 6 | __version__ = "2.2.0" 7 | __status__ = "Development" 8 | 
-------------------------------------------------------------------------------- /gitlab2prov/adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/gitlab2prov/adapters/__init__.py -------------------------------------------------------------------------------- /gitlab2prov/adapters/fetch/__init__.py: -------------------------------------------------------------------------------- 1 | from gitlab2prov.adapters.fetch.git import GitFetcher 2 | from gitlab2prov.adapters.fetch.gitlab import GitlabFetcher 3 | -------------------------------------------------------------------------------- /gitlab2prov/adapters/fetch/annotations/__init__.py: -------------------------------------------------------------------------------- 1 | from gitlab2prov.adapters.fetch.annotations.classifiers import CLASSIFIERS 2 | from gitlab2prov.adapters.fetch.annotations.classifiers import IMPORT_STATEMENT 3 | from gitlab2prov.adapters.fetch.annotations.classifiers import AnnotationClassifier 4 | from gitlab2prov.adapters.fetch.annotations.parse import parse_annotations 5 | -------------------------------------------------------------------------------- /gitlab2prov/adapters/fetch/annotations/parse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import operator 3 | import uuid 4 | from typing import Any 5 | from typing import Callable 6 | from typing import Sequence 7 | from typing import TypeAlias 8 | 9 | from gitlab.v4.objects import ProjectCommitComment 10 | from gitlab.v4.objects import ProjectIssueAwardEmoji 11 | from gitlab.v4.objects import ProjectIssueNote 12 | from gitlab.v4.objects import ProjectIssueNoteAwardEmoji 13 | from gitlab.v4.objects import ProjectIssueResourceLabelEvent 14 | from gitlab.v4.objects import ProjectMergeRequestAwardEmoji 15 | from gitlab.v4.objects import 
def normalize(string: str) -> str:
    """Return *string* lower-cased with surrounding whitespace removed."""
    trimmed = string.strip()
    return trimmed.lower()


def longest_matching_classifier(string: str) -> AnnotationClassifier | None:
    """Return the matching classifier with the greatest ``len``.

    Every classifier in ``CLASSIFIERS`` is asked whether it matches *string*;
    ``None`` is returned when none of them does.
    """
    candidates = [classifier for classifier in CLASSIFIERS if classifier.matches(string)]
    if not candidates:
        return None
    return max(candidates, key=len)


def classify_system_note(string: str) -> tuple[str, dict[str, Any]]:
    """Classify the body of a system note.

    Returns the name of the best matching classifier (or ``DEFAULT`` when
    there is none) together with the groups captured while matching.
    """
    text = normalize(string)
    captured = {}
    # An import statement is stripped before classification, but the groups
    # it captured are kept.
    if IMPORT_STATEMENT.matches(text):
        text = IMPORT_STATEMENT.replace(text)
        captured = IMPORT_STATEMENT.groupdict()
    classifier = longest_matching_classifier(text)
    # Deliberately a truthiness check: classifiers define ``__len__``.
    if not classifier:
        return DEFAULT, captured
    captured.update(classifier.groupdict())
    return classifier.name, captured


def _annotator_from(account) -> User:
    """Build the annotating ``User`` from a GitLab author/user mapping."""
    return User(
        name=account.get("name"),
        email=account.get("email"),
        gitlab_username=account.get("username"),
        gitlab_id=account.get("id"),
        prov_role=ProvRole.ANNOTATOR,
    )


def parse_system_note(note: Note) -> Annotation:
    """Turn a system note into an ``Annotation`` typed by its classified body."""
    annotator = _annotator_from(note.author)
    annotation_type, kwargs = classify_system_note(note.body)
    return Annotation(
        id=note.id,
        type=annotation_type,
        body=note.body,
        kwargs=kwargs,
        annotator=annotator,
        prov_start=note.created_at,
        prov_end=note.created_at,
    )


def parse_comment(comment: Comment) -> Annotation:
    """Turn a commit comment into an ``add_comment`` annotation.

    The annotation id is assembled from a fresh UUID, the annotator's GitLab
    id and the absolute hash of the comment text.
    """
    annotator = _annotator_from(comment.author)
    annotation_id = f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}"
    return Annotation(
        id=annotation_id,
        type="add_comment",
        body=comment.note,
        annotator=annotator,
        prov_start=comment.created_at,
        prov_end=comment.created_at,
    )


def parse_note(note: Note) -> Annotation:
    """Turn a regular (non-system) note into an ``add_note`` annotation."""
    return Annotation(
        id=note.id,
        type="add_note",
        body=note.body,
        annotator=_annotator_from(note.author),
        prov_start=note.created_at,
        prov_end=note.created_at,
    )


def parse_award(award: AwardEmoji) -> Annotation:
    """Turn an award emoji into an ``award_emoji`` annotation named after the emoji."""
    return Annotation(
        id=award.id,
        type="award_emoji",
        body=award.name,
        annotator=_annotator_from(award.user),
        prov_start=award.created_at,
        prov_end=award.created_at,
    )
def parse_label(label: Label) -> Annotation:
    """Turn a label event into an ``<action>_label`` annotation."""
    user = label.user
    annotator = User(
        name=user.get("name"),
        email=user.get("email"),
        gitlab_username=user.get("username"),
        gitlab_id=user.get("id"),
        prov_role=ProvRole.ANNOTATOR,
    )
    action = label.action
    return Annotation(
        id=label.id,
        type=f"{action}_label",
        body=action,
        annotator=annotator,
        prov_start=label.created_at,
        prov_end=label.created_at,
    )


def choose_parser(
    parseable: Note | Comment | AwardEmoji | Label,
) -> Callable[[Note | Comment | AwardEmoji | Label], Annotation] | None:
    """Pick the parse function that fits the type of *parseable*.

    Notes whose ``system`` attribute is ``True`` go to ``parse_system_note``,
    all other notes to ``parse_note``. A warning is logged and ``None`` is
    returned when no parser fits.
    """
    note_types = (ProjectIssueNote, ProjectMergeRequestNote)
    label_types = (ProjectIssueResourceLabelEvent, ProjectMergeRequestResourceLabelEvent)
    award_types = (
        ProjectIssueAwardEmoji,
        ProjectIssueNoteAwardEmoji,
        ProjectMergeRequestAwardEmoji,
        ProjectMergeRequestNoteAwardEmoji,
    )
    if isinstance(parseable, note_types):
        # A missing ``system`` attribute counts as a regular note.
        if getattr(parseable, "system", None) is True:
            return parse_system_note
        return parse_note
    if isinstance(parseable, ProjectCommitComment):
        return parse_comment
    if isinstance(parseable, label_types):
        return parse_label
    if isinstance(parseable, award_types):
        return parse_award
    log.warning(f"no parser found for {parseable=}")
    return None


def parse_annotations(
    parseables: Sequence[Note | Comment | AwardEmoji | Label],
) -> Sequence[Annotation]:
    """Parse every supported item and return the annotations ordered by ``prov_start``.

    Items for which ``choose_parser`` finds no parser are left out.
    """
    parsed = [parser(item) for item in parseables if (parser := choose_parser(item))]
    parsed.sort(key=operator.attrgetter("prov_start"))
    return parsed
git import Commit 6 | from git import Repo 7 | 8 | from gitlab2prov.adapters.fetch.utils import clone_over_https_url 9 | from gitlab2prov.domain.constants import ChangeType 10 | from gitlab2prov.domain.constants import ProvRole 11 | from gitlab2prov.domain.objects import File 12 | from gitlab2prov.domain.objects import FileRevision 13 | from gitlab2prov.domain.objects import GitCommit 14 | from gitlab2prov.domain.objects import User 15 | 16 | 17 | LOG_DELIMITER = "====DELIMITER====" 18 | EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" 19 | 20 | 21 | @dataclass 22 | class GitFetcher: 23 | url: str 24 | token: str 25 | 26 | _repo: Repo | None = None 27 | _tmpdir: TemporaryDirectory | None = None 28 | 29 | def __enter__(self): 30 | self._tmpdir = TemporaryDirectory(ignore_cleanup_errors=True) 31 | return self 32 | 33 | def __exit__(self, exc_type, exc_val, exc_tb): 34 | if self._repo: 35 | self._repo.close() 36 | if self._tmpdir: 37 | self._tmpdir.cleanup() 38 | 39 | def do_clone(self) -> None: 40 | url = clone_over_https_url(self.url, self.token) 41 | self._repo = Repo.clone_from( 42 | url=url, 43 | to_path=self._tmpdir.name, 44 | ) 45 | 46 | def fetch_git(self) -> Iterator[GitCommit | File | FileRevision]: 47 | for commit in self._repo.iter_commits("--all"): 48 | yield self.git_commit_to_domain_commit(commit) 49 | for file in self.fetch_files_for_commit(commit): 50 | yield file 51 | for revision in self.fetch_revisions_for_file(file): 52 | yield revision 53 | 54 | @staticmethod 55 | def git_commit_to_domain_commit(commit: Commit) -> GitCommit: 56 | return GitCommit( 57 | hexsha=commit.hexsha, 58 | message=commit.message, 59 | title=commit.summary, 60 | author=get_author(commit), 61 | committer=get_committer(commit), 62 | parents=[parent.hexsha for parent in commit.parents], 63 | prov_start=commit.authored_datetime, 64 | prov_end=commit.committed_datetime, 65 | ) 66 | 67 | def fetch_files_for_commit(self, commit: Commit) -> Iterator[File]: 68 | # choose 
def parse_log(log: str) -> Iterator[tuple[str, str, str]]:
    """Parse 'git log' output into commit hexshas, file status (aka change type), file paths.

    The log is expected to consist of entries that are introduced by
    ``LOG_DELIMITER`` on a line of its own, followed by the commit hexsha and
    then one tab separated ``status<TAB>path`` line per touched file.

    Yields:
        ``(hexsha, status, path)`` for every status line of every entry, where
        ``status`` is the first character of the status column and ``path``
        is the last tab separated column of the line.
    """
    # split the log into single entries using the delimiter
    for entry in log.split(f"{LOG_DELIMITER}\n"):
        # strip first and filter afterwards: a whitespace-only line would
        # otherwise survive as "" and break the status lookup below
        lines = [stripped for line in entry.split("\n") if (stripped := line.strip())]
        # skip entries without any content (e.g. the text before the first delimiter)
        if not lines:
            continue
        # first line is always the commit hexsha
        hexsha, *changes = lines
        for change in changes:
            # split the line by tab characters
            parts = change.split("\t")
            # status is the first character in the line
            status = parts[0][0]
            # path is always the last element when split by tab
            path = parts[-1]
            yield hexsha, status, path
def on_gitlab_list_error(func):
    """Decorate an extractor so that a ``GitlabListError`` ends its stream.

    The decorated callables are generator functions: calling one merely
    creates the generator, and the list requests run while it is iterated.
    Guarding only the call would therefore never catch the error, so the
    wrapper is a generator itself and guards the iteration.

    When the error occurs, an info message is logged and the iteration stops;
    items that were produced before the error have already been passed on.
    Any other exception propagates unchanged.
    """
    from functools import wraps

    @wraps(func)
    def wrapped(*args, **kwargs):
        try:
            yield from func(*args, **kwargs)
        except GitlabListError as e:
            msg = f"{func.__module__}.{func.__name__}: {type(e)} due to {e.response_code} HTTP Error."
            log.info(msg)

    return wrapped
@on_gitlab_list_error
def extract_issues(project: Project) -> Iterator[Issue]:
    """Yield a domain ``Issue`` for every issue of *project*.

    The annotations of an issue are parsed from its notes, its award emojis,
    its label events and the award emojis attached to each of its notes.
    """
    for issue in project.issues.list(all=True):
        # List the notes once and reuse the result: every listing is a
        # (paginated) request, and the notes are needed both as annotation
        # sources and to look up the award emojis attached to them.
        notes = issue.notes.list(all=True)
        # NOTE(review): a list is used instead of a set because nothing is
        # fetched twice anymore; a set would presumably also merge distinct
        # resources that happen to compare equal -- confirm against
        # python-gitlab's object equality if de-duplication is required.
        parseables = [
            *notes,
            *issue.awardemojis.list(all=True),
            *issue.resourcelabelevents.list(all=True),
            *(award for note in notes for award in note.awardemojis.list(all=True)),
        ]
        yield Issue(
            id=issue.id,
            iid=issue.iid,
            title=issue.title,
            description=issue.description,
            url=issue.web_url,
            author=get_resource_author(issue, ProvRole.AUTHOR_ISSUE),
            annotations=parse_annotations(parseables),
            created_at=issue.created_at,
            closed_at=issue.closed_at,
        )
@on_gitlab_list_error
def extract_tags(project: Project) -> Iterator[Tag]:
    """Yield a domain ``Tag`` for every tag listed for *project*."""
    listed_tags = project.tags.list(all=True)
    for gitlab_tag in listed_tags:
        # the creation date is read from the commit data attached to the tag
        tag_fields = {
            "name": gitlab_tag.name,
            "hexsha": gitlab_tag.target,
            "message": gitlab_tag.message,
            "author": get_tag_author(gitlab_tag),
            "created_at": gitlab_tag.commit.get("created_at"),
        }
        yield Tag(**tag_fields)
def gitlab_url(url: str) -> str:
    """Return ``scheme://netloc`` of *url*, dropping path, query and fragment."""
    # urlsplit returns a 5-tuple; only the first two components are needed
    scheme, netloc, *_ = urlsplit(url)
    return f"{scheme}://{netloc}"
def bootstrap(
    uow: unit_of_work.AbstractUnitOfWork | None = None,
    git_fetcher: Type[GitFetcher] = GitFetcher,
    gitlab_fetcher: Type[GitlabFetcher] = GitlabFetcher,
):
    """Build a message bus whose handlers have their dependencies injected.

    Args:
        uow: Unit of work shared by the handlers and the message bus. A fresh
            ``InMemoryUnitOfWork`` is created per call when omitted.
        git_fetcher: Fetcher class handed to handlers asking for ``git_fetcher``.
        gitlab_fetcher: Fetcher class handed to handlers asking for ``gitlab_fetcher``.

    Returns:
        A ``MessageBus`` over ``uow`` with one list of injected handlers per
        command type found in ``handlers.HANDLERS``.
    """
    # Create the default here instead of in the signature: a default argument
    # is evaluated once, which would make all calls share one unit of work.
    if uow is None:
        uow = unit_of_work.InMemoryUnitOfWork()
    dependencies = {
        "uow": uow,
        "git_fetcher": git_fetcher,
        "gitlab_fetcher": gitlab_fetcher,
    }
    injected_handlers = {
        command_type: [
            inject_dependencies(handler, dependencies) for handler in command_handlers
        ]
        # named command_handlers so the loop variable does not shadow the handlers module
        for command_type, command_handlers in handlers.HANDLERS.items()
    }

    return messagebus.MessageBus(uow, handlers=injected_handlers)
gitlab2prov.config.parser import ConfigParser -------------------------------------------------------------------------------- /gitlab2prov/config/parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import jsonschema 5 | from ruamel.yaml import YAML 6 | 7 | from gitlab2prov.root import get_package_root 8 | 9 | 10 | def read_file(filepath: str) -> Any: 11 | with open(filepath, "rt") as f: 12 | yaml = YAML(typ="safe") 13 | return yaml.load(f.read()) 14 | 15 | 16 | def get_schema() -> dict[str, Any]: 17 | path = get_package_root() / "config" / "schema.json" 18 | with open(path, "rt", encoding="utf-8") as f: 19 | return json.loads(f.read()) 20 | 21 | 22 | class ConfigParser: 23 | @staticmethod 24 | def validate(filepath: str) -> None: 25 | jsonschema.validate(read_file(filepath), get_schema()) 26 | 27 | def parse(self, filepath: str) -> list[str]: 28 | content = read_file(filepath) 29 | return list(self.parse_array(content)) 30 | 31 | def parse_array(self, arr: list[Any]): 32 | for obj in arr: 33 | yield from self.parse_object(obj) 34 | 35 | def parse_object(self, obj: dict[str, Any]): 36 | cmd = list(obj.keys())[0] 37 | yield cmd 38 | yield from self.parse_options(obj[cmd]) 39 | 40 | def parse_options(self, options: dict[str, bool | str | list[str]] | None): 41 | if not options: 42 | return 43 | for name, value in options.items(): 44 | yield from self.parse_option(name, value) 45 | 46 | def parse_option(self, name: str, literal: bool | str | list[str]): 47 | match literal: 48 | case bool(): 49 | yield f"--{name}" 50 | case str(): 51 | yield f"--{name}" 52 | yield literal 53 | case list() as litlist: 54 | for lit in litlist: 55 | yield f"--{name}" 56 | yield lit 57 | case _: 58 | raise ValueError(f"Unknown literal type!") 59 | -------------------------------------------------------------------------------- /gitlab2prov/config/schema.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "type": "array", 4 | "items": [ 5 | { 6 | "type": "object", 7 | "properties": { 8 | "extract": { 9 | "type": "object", 10 | "properties": { 11 | "url": { 12 | "type": "array", 13 | "items": { 14 | "format": "uri", 15 | "type": "string" 16 | } 17 | }, 18 | "token": { 19 | "type": "string" 20 | } 21 | }, 22 | "required": [ 23 | "url", 24 | "token" 25 | ] 26 | } 27 | } 28 | }, 29 | { 30 | "type": "object", 31 | "properties": { 32 | "open": { 33 | "type": "object", 34 | "properties": { 35 | "input": { 36 | "type": "array", 37 | "items": { 38 | "type": "string" 39 | } 40 | } 41 | }, 42 | "required": [ 43 | "input" 44 | ] 45 | } 46 | } 47 | }, 48 | { 49 | "type": "object", 50 | "properties": { 51 | "combine": { 52 | "type": "null" 53 | } 54 | } 55 | }, 56 | { 57 | "type": "object", 58 | "properties": { 59 | "save": { 60 | "type": "object", 61 | "properties": { 62 | "output": { 63 | "type": "string" 64 | }, 65 | "format": { 66 | "type": "array", 67 | "items": { 68 | "type": "string" 69 | } 70 | } 71 | }, 72 | "required": [ 73 | "output", 74 | "format" 75 | ] 76 | } 77 | } 78 | }, 79 | { 80 | "type": "object", 81 | "properties": { 82 | "pseudonymize": { 83 | "type": "null" 84 | } 85 | } 86 | }, 87 | { 88 | "type": "object", 89 | "properties": { 90 | "stats": { 91 | "type": "object", 92 | "properties": { 93 | "explain": { 94 | "type": "boolean" 95 | }, 96 | "fine": { 97 | "type": "boolean" 98 | }, 99 | "coarse": { 100 | "type": "boolean" 101 | }, 102 | "formatter": { 103 | "type": "string", 104 | "enum": [ 105 | "table", 106 | "csv" 107 | ] 108 | } 109 | } 110 | } 111 | } 112 | } 113 | ] 114 | } 115 | -------------------------------------------------------------------------------- /gitlab2prov/domain/__init__.py: -------------------------------------------------------------------------------- 
class ChangeType:
    """Single-letter codes naming the kind of change applied to a file.

    NOTE(review): the letters look like the status letters git uses for
    changed files -- confirm against the git documentation.
    """

    ADDED = "A"
    MODIFIED = "M"
    DELETED = "D"
    RENAMED = "R"
    COPIED = "C"
    UNMERGED = "U"
    UNKNOWN = "X"
    BROKEN = "B"
    CHANGED = "T"
class ProvType:
    """String constants naming the prov types used by the domain objects."""

    USER = "User"
    GIT_COMMIT = "GitCommit"
    FILE = "File"
    FILE_REVISION = "FileRevision"
    # GitLab commit resource: the resource itself, its versions, its creation
    GITLAB_COMMIT = "GitlabCommit"
    GITLAB_COMMIT_VERSION = "GitlabCommitVersion"
    GITLAB_COMMIT_VERSION_ANNOTATED = "AnnotatedGitlabCommitVersion"
    GITLAB_COMMIT_CREATION = "GitlabCommitCreation"
    # issue resource: the resource itself, its versions, its creation
    ISSUE = "Issue"
    ISSUE_VERSION = "IssueVersion"
    ISSUE_VERSION_ANNOTATED = "AnnotatedIssueVersion"
    ISSUE_CREATION = "IssueCreation"
    # merge request resource: the resource itself, its versions, its creation
    MERGE_REQUEST = "MergeRequest"
    MERGE_REQUEST_VERSION = "MergeRequestVersion"
    MERGE_REQUEST_VERSION_ANNOTATED = "AnnotatedMergeRequestVersion"
    MERGE_REQUEST_CREATION = "MergeRequestCreation"
    ANNOTATION = "Annotation"
    TAG = "Tag"
    TAG_CREATION = "TagCreation"
    RELEASE = "Release"
    RELEASE_CREATION = "ReleaseCreation"
    ASSET = "Asset"
    EVIDENCE = "Evidence"
    # not a literal: re-uses the collection constant imported from the prov package
    COLLECTION = PROV_ATTR_COLLECTION
class ProvMixin:
    """Derive PROV identifier, label and attributes from dataclass fields.

    Meant to be mixed into dataclasses that define a ``prov_type`` attribute.
    """

    @property
    def prov_identifier(self) -> QualifiedName:
        """Qualified name made of the prov type and the repr-enabled fields."""
        return qualified_name(self._typed_query_string())

    @property
    def prov_label(self) -> QualifiedName:
        """Label of the object; built the same way as the identifier."""
        return qualified_name(self._typed_query_string())

    @property
    def prov_attributes(self) -> list[tuple[str, str | int | datetime | None]]:
        """All attribute pairs of the object as a list."""
        return [*self._traverse_attributes()]

    def _typed_query_string(self) -> str:
        """Return ``<prov type>?<url-encoded repr fields>``."""
        query = urlencode(dict(self._traverse_repr_fields()))
        return f"{self._prov_type()}?{query}"

    def _prov_type(self) -> str:
        """Return the prov type; the first entry when it is given as a list."""
        declared = self.prov_type
        return declared[0] if isinstance(declared, list) else declared

    def _traverse_repr_fields(self):
        """Yield ``(name, value)`` for every field that takes part in the repr."""
        yield from (
            (spec.name, getattr(self, spec.name)) for spec in fields(self) if spec.repr
        )

    def _traverse_attributes(self):
        """Yield the attribute pairs of all non-relation fields, then the label."""
        for spec in fields(self):
            if is_relation(spec):
                continue
            yield from self._expand_attribute(spec.name, getattr(self, spec.name))
        yield PROV_LABEL, self.prov_label

    def _expand_attribute(self, key, val):
        """Yield the pairs that represent one field value.

        The key is translated through ``PROV_FIELD_MAP`` when it is listed
        there. A list yields one pair per item, a dict yields its own items,
        anything else yields a single pair.
        """
        prov_key = PROV_FIELD_MAP.get(key, key)
        if isinstance(val, list):
            yield from ((prov_key, item) for item in val)
        elif isinstance(val, dict):
            yield from val.items()
        else:
            yield prov_key, val
@dataclass(unsafe_hash=True, kw_only=True)
class GitCommit(ProvMixin, ActivityMixin):
    """A commit of the git repository, modelled as an activity.

    Only ``hexsha`` takes part in the repr. ``author``, ``committer`` and
    ``parents`` are marked as relations to other objects.
    """

    hexsha: str
    message: str = field(repr=False)
    title: str = field(repr=False)
    author: User = field(repr=False, metadata=IS_RELATION)
    committer: User = field(repr=False, metadata=IS_RELATION)
    # A list is unhashable, so the field is kept out of the generated
    # __hash__ (it still takes part in ==); otherwise hash() would raise
    # TypeError despite unsafe_hash=True.
    parents: list[str] = field(repr=False, hash=False, metadata=IS_RELATION)
    prov_start: datetime = field(repr=False)
    prov_end: datetime = field(repr=False)
    prov_type: ProvType = field(init=False, repr=False, default=ProvType.GIT_COMMIT)
@dataclass(unsafe_hash=True, kw_only=True)
class GitlabCommit(ProvMixin, EntityMixin):
    """A commit as a GitLab resource, together with its annotations.

    Only ``hexsha`` takes part in the repr. ``author`` and ``annotations``
    are marked as relations to other objects.
    """

    hexsha: str
    url: str = field(repr=False)
    author: User = field(repr=False, metadata=IS_RELATION)
    # A list is unhashable, so the field is kept out of the generated
    # __hash__ (it still takes part in ==); otherwise hash() would raise
    # TypeError despite unsafe_hash=True.
    annotations: list[Annotation] = field(repr=False, hash=False, metadata=IS_RELATION)
    authored_at: datetime = field(repr=False)
    committed_at: datetime = field(repr=False)
    prov_type: ProvType = field(init=False, repr=False, default=ProvType.GITLAB_COMMIT)

    @property
    def creation(self) -> Creation:
        """Creation activity spanning from ``authored_at`` to ``committed_at``."""
        return Creation(
            creation_id=self.hexsha,
            prov_start=self.authored_at,
            prov_end=self.committed_at,
            prov_type=ProvType.GITLAB_COMMIT_CREATION,
        )

    @property
    def first_version(self) -> Version:
        """Version identified by the commit's hexsha."""
        return Version(version_id=self.hexsha, prov_type=ProvType.GITLAB_COMMIT_VERSION)

    @property
    def annotated_versions(self) -> list[AnnotatedVersion]:
        """One annotated version per annotation, in the order of ``annotations``."""
        return [
            AnnotatedVersion(
                version_id=self.hexsha,
                annotation_id=annotation.id,
                prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED,
            )
            for annotation in self.annotations
        ]
@dataclass(unsafe_hash=True, kw_only=True)
class Tag(ProvMixin, EntityMixin):
    """A tag of the project; typed as both a tag and a collection.

    ``name`` and ``hexsha`` take part in the repr. ``author`` is marked as a
    relation to another object.
    """

    name: str
    hexsha: str
    message: str | None = field(repr=False)
    author: User = field(repr=False, metadata=IS_RELATION)
    created_at: datetime = field(repr=False)
    # The default is a list and thus unhashable. It is kept out of the
    # generated __hash__ (it still takes part in ==); otherwise hash() would
    # raise TypeError for every instance despite unsafe_hash=True.
    prov_type: list[ProvType] = field(
        init=False,
        repr=False,
        hash=False,
        default_factory=lambda: [ProvType.TAG, ProvType.COLLECTION],
    )

    @property
    def creation(self) -> Creation:
        """Creation activity that starts and ends at ``created_at``."""
        return Creation(
            creation_id=self.name,
            prov_start=self.created_at,
            prov_end=self.created_at,
            prov_type=ProvType.TAG_CREATION,
        )
@dataclass(unsafe_hash=True, kw_only=True)
class Release(ProvMixin, EntityMixin):
    """A release of the project; typed as both a release and a collection.

    Only ``name`` takes part in the repr. ``author``, ``assets`` and
    ``evidences`` are marked as relations to other objects.
    """

    name: str
    description: str = field(repr=False)
    tag_name: str = field(repr=False)
    author: User | None = field(repr=False, metadata=IS_RELATION)
    # Lists are unhashable, so the list-valued fields are kept out of the
    # generated __hash__ (they still take part in ==); otherwise hash() would
    # raise TypeError for every instance despite unsafe_hash=True.
    assets: list[Asset] = field(repr=False, hash=False, metadata=IS_RELATION)
    evidences: list[Evidence] = field(repr=False, hash=False, metadata=IS_RELATION)
    created_at: datetime = field(repr=False)
    released_at: datetime = field(repr=False)
    prov_type: list[ProvType] = field(
        init=False,
        repr=False,
        hash=False,
        default_factory=lambda: [ProvType.RELEASE, ProvType.COLLECTION],
    )

    @property
    def creation(self) -> Creation:
        """Creation activity spanning from ``created_at`` to ``released_at``."""
        return Creation(
            creation_id=self.name,
            prov_start=self.created_at,
            prov_end=self.released_at,
            prov_type=ProvType.RELEASE_CREATION,
        )
def validate_config(ctx: click.Context, _, filepath: str):
    """Callback that validates config file using gitlab2prov/config/schema.json.

    Does nothing when no file path was passed. Otherwise the file is
    validated and parsed; any error raised by either step aborts the run
    through ``ctx.fail``. On success the parsed arguments and an ``-- OK --``
    marker are echoed and the click context exits.
    """
    if not filepath:
        return
    try:
        ConfigParser().validate(filepath)
        parsed = ConfigParser().parse(filepath)
    except Exception as err:
        # ctx.fail raises, so nothing below runs for an invalid config.
        ctx.fail(f"validation failed: {err}")
    # Route all output through click.echo (instead of a bare print) so it is
    # handled the same way as every other message of this CLI.
    click.echo(parsed)
    click.echo("-- OK --")
    ctx.exit()
@cli.result_callback()
def process_commands(processors, **kwargs):
    """Chain the stream processors returned by the subcommands and run them.

    Every chained subcommand returns a function that takes a stream and
    returns a stream. The functions are applied in order, each one consuming
    the output of the previous one, much like a unix pipe. The resulting
    stream is then exhausted so that all stages actually execute.
    """
    # The first stage is fed an empty iterable.
    pipeline = ()
    for stage in processors:
        pipeline = stage(pipeline)

    # Drain the pipeline; the produced items themselves are not needed.
    for _ in pipeline:
        continue
@cli.command("save")
@click.option(
    "-f",
    "--format",
    multiple=True,
    default=["json"],
    type=click.Choice(operations.SERIALIZATION_FORMATS),
    help="Serialization format.",
)
@click.option(
    "-o",
    "--output",
    default="gitlab2prov-graph-{:04}",
    help="Output file path.",
)
@processor
def save(graphs, format, output):
    """Save provenance information to a file.

    This command writes each provenance graph that is piped to it to a file.
    """
    # "-" selects stdout instead of a file.
    to_stdout = output == "-"
    for number, graph in enumerate(graphs, start=1):
        for fmt in format:
            try:
                serialized = operations.serialize_graph(graph, fmt)
                if to_stdout:
                    click.echo(serialized)
                    continue
                # The running graph number is substituted into the output pattern.
                target = f"{output.format(number)}.{fmt}"
                with open(target, "w") as out:
                    click.echo(serialized, file=out)
            except Exception as e:
                # Best effort: report the failure and keep going.
                click.echo(f"Could not save {graph.description}: {e}", err=True)
        # Pass the graph on unchanged so later commands can use it.
        yield graph
@cli.command("combine")
@processor
def combine(graphs):
    """Combine multiple graphs into one.

    This command combines all graphs that are piped to it into one.
    """
    # Materialize the stream: the graphs are needed both for combining and
    # for building the description / error message afterwards.
    graphs = list(graphs)
    try:
        combined = operations.combine(iter(graphs))
        descriptions = ", ".join(graph.description for graph in graphs)
        combined.description = f"combination of {descriptions}"
    except Exception as e:
        # Separate the descriptions with " with " (surrounding spaces) so the
        # message reads "a with b" rather than "awith b".
        descriptions = " with ".join(graph.description for graph in graphs)
        click.echo(f"Could not combine {descriptions}: {e}", err=True)
    else:
        yield combined
@cli.command()
@click.option(
    "--mapping",
    type=click.Path(exists=True, dir_okay=False),
    help="File path to duplicate agent mapping.",
)
@processor
def merge_duplicated_agents(graphs, mapping):
    """Merge duplicated agents based on a name to aliases mapping.

    This command solves the problem of duplicated agents that can occur when the same physical user
    uses different user names and emails for his git and gitlab account.
    Based on a mapping of names to aliases the duplicated agents can be merged.
    """
    for graph in graphs:
        merged = operations.merge_duplicated_agents(graph, mapping)
        # The merge operation builds a new document, which does not carry the
        # ad-hoc ``description`` attribute. Derive it from the input graph,
        # as the other commands of this module do.
        merged.description = f"merged double agents {graph.description}"
        yield merged
def choose_rev_model(rev: FileRevision):
    """Pick the file change model that matches the change type of *rev*.

    Returns ``addition``, ``modification`` or ``deletion``; ``None`` when the
    change type is not covered by any model.
    """
    change = rev.change_type
    if change == ChangeType.ADDED:
        return addition
    if change == ChangeType.DELETED:
        return deletion
    # Renames, copies and type changes are modelled like modifications.
    if change in (
        ChangeType.MODIFIED,
        ChangeType.RENAMED,
        ChangeType.COPIED,
        ChangeType.CHANGED,
    ):
        return modification
    return None
def modification(
    commit: GitCommit,
    parents: list[GitCommit],
    fv: FileRevision,
    graph: ProvDocument = None,
):
    """Add model for the modification of an existing file in a commit."""
    graph = graph_factory() if graph is None else graph

    commit_activity = graph.activity(*commit)
    author = graph.agent(*commit.author)
    committer = graph.agent(*commit.committer)

    # Associate author and committer using the role stored on each agent.
    for agent in (author, committer):
        role = list(agent.get_attribute(PROV_ROLE))[0]
        commit_activity.wasAssociatedWith(agent, plan=None, attributes=[(PROV_ROLE, role)])

    for parent in parents:
        graph.activity(*commit).wasInformedBy(graph.activity(*parent))

    original_file = graph.entity(*fv.original)
    revision = graph.entity(*fv)
    revision.wasAttributedTo(author)
    revision.specializationOf(original_file)
    revision.wasGeneratedBy(
        commit_activity,
        time=commit_activity.get_startTime(),
        attributes=[(PROV_ROLE, ProvRole.FILE_REVISION_AFTER_MODIFICATION)],
    )

    # Link to the previous revision only if one exists.
    if fv.previous is not None:
        previous = graph.entity(*fv.previous)
        previous.specializationOf(original_file)
        # NOTE: revision.wasRevisionOf(previous) is not impl in prov pkg
        graph.wasRevisionOf(revision, previous)
        commit_activity.used(
            previous,
            commit_activity.get_startTime(),
            [(PROV_ROLE, ProvRole.FILE_REVISION_TO_BE_MODIFIED)],
        )
    return graph
def gitlab_commit_model(resources, graph: ProvDocument = None):
    """Build the GitLab commit model for every GitlabCommit in *resources*.

    For each GitLab commit the matching git commit is looked up by hexsha
    (it may be missing) and the commit creation as well as the annotation
    chain are added to *graph*. A new graph is created when none is passed.
    """
    if graph is None:
        graph = graph_factory()
    for gitlab_commit in resources.list_all(GitlabCommit):
        git_commit = resources.get(GitCommit, hexsha=gitlab_commit.hexsha)
        graph.update(commit_creation(gitlab_commit, git_commit))
        graph.update(annotation_chain(gitlab_commit))
    # Return only after ALL commits were processed; returning from inside the
    # loop would drop every commit but the first.
    return graph
def resource_creation(resource: Resource, graph: ProvDocument = None):
    """Add model for the creation of a resource and its first version."""
    graph = graph_factory() if graph is None else graph

    entity = graph.entity(*resource)
    creation = graph.activity(*resource.creation)
    first_version = graph.entity(*resource.first_version)
    author = graph.agent(*resource.author)

    # The association uses the role stored on the author agent itself.
    author_role = list(author.get_attribute(PROV_ROLE))[0]
    creation.wasAssociatedWith(
        author,
        plan=None,
        attributes=[(PROV_ROLE, author_role)],
    )

    for attributed in (entity, first_version):
        attributed.wasAttributedTo(author)
    first_version.specializationOf(entity)

    # Both the resource and its first version are generated by the creation.
    generated = (
        (entity, ProvRole.RESOURCE),
        (first_version, ProvRole.RESOURCE_VERSION_AT_POINT_OF_CREATION),
    )
    for generated_entity, role in generated:
        generated_entity.wasGeneratedBy(
            creation,
            time=creation.get_startTime(),
            attributes=[(PROV_ROLE, role)],
        )
    return graph
def gitlab_release_tag_model(resources, graph: ProvDocument = None):
    """Build the release/tag model for every Tag in *resources*.

    Release and commit are looked up per tag (by tag name and hexsha); both
    lookups may yield ``None``, which the sub-models accept.
    """
    graph = graph_factory() if graph is None else graph
    for tag in resources.list_all(Tag):
        matching_release = resources.get(Release, tag_name=tag.name)
        tagged_commit = resources.get(GitlabCommit, hexsha=tag.hexsha)
        release_graph = release_and_tag(matching_release, tag)
        graph.update(release_graph)
        commit_graph = tag_and_commit(tag, tagged_commit)
        graph.update(commit_graph)
    return graph
def tag_and_commit(tag: Tag, commit: Optional[GitlabCommit], graph: ProvDocument = None):
    """Add model for a tag, its creation and (if present) the tagged commit."""
    graph = graph_factory() if graph is None else graph

    tag_collection = graph.collection(*tag)
    tag_creation = graph.activity(*tag.creation)
    tag_author = graph.agent(*tag.author)
    tag_collection.wasAttributedTo(tag_author)
    tag_collection.wasGeneratedBy(
        tag_creation,
        time=tag_creation.get_startTime(),
        attributes=[(PROV_ROLE, ProvRole.TAG)],
    )
    tag_author_role = list(tag_author.get_attribute(PROV_ROLE))[0]
    tag_creation.wasAssociatedWith(
        tag_author, plan=None, attributes=[(PROV_ROLE, tag_author_role)]
    )

    # Nothing more to add when no commit is known for the tag.
    if commit is not None:
        commit_entity = graph.entity(*commit)
        commit_creation_activity = graph.activity(*commit.creation)
        commit_author = graph.agent(*commit.author)
        commit_entity.hadMember(tag_collection)
        commit_entity.wasAttributedTo(commit_author)
        commit_entity.wasGeneratedBy(
            commit_creation_activity,
            time=commit_creation_activity.get_startTime(),
            attributes=[(PROV_ROLE, ProvRole.GIT_COMMIT)],
        )
        commit_author_role = list(commit_author.get_attribute(PROV_ROLE))[0]
        commit_creation_activity.wasAssociatedWith(
            commit_author, plan=None, attributes=[(PROV_ROLE, commit_author_role)]
        )

    return graph
def deserialize_graph(source: str = None, content: str = None):
    """Deserialize a provenance document, trying each supported format in turn.

    The formats listed in ``DESERIALIZATION_FORMATS`` are tried in order and
    the first successful result is returned.

    Raises:
        Exception: if no format could deserialize the input. The error of the
            last attempt is attached as the cause.
    """
    last_error = None
    for fmt in DESERIALIZATION_FORMATS:
        try:
            return ProvDocument.deserialize(source=source, content=content, format=fmt)
        except Exception as err:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            last_error = err
    # Still a plain Exception, because callers catch that type.
    tried = ", ".join(DESERIALIZATION_FORMATS)
    raise Exception(f"Could not deserialize graph (tried formats: {tried}).") from last_error
def combine(graphs: Iterable[ProvDocument]) -> ProvDocument:
    """Combine all *graphs* into one deduplicated graph.

    The first graph is used as accumulator and updated in place with all
    following graphs. An empty input yields a new, empty graph.
    """
    log.info(f"combine graphs {graphs}")
    # Accept any iterable (e.g. a list), not only iterators/generators:
    # calling next() directly on a list raises a TypeError.
    remaining = iter(graphs)
    try:
        acc = next(remaining)
    except StopIteration:
        return graph_factory()
    for graph in remaining:
        acc.update(graph)
    return dedupe(acc)
def merge_duplicated_agents(graph, path_to_mapping):
    """Merge agents that are aliases of the same user.

    The alias mapping is read from *path_to_mapping*. Every agent's name is
    resolved to the name it is an alias of. All agents resolving to the same
    name share one set of attributes and one identifier, and relations are
    rerouted to that identifier. Returns a new, unified document.
    """
    log.info(f"resolve aliases in {graph=}")
    mapping = read_duplicated_agent_mapping(path_to_mapping)
    names = build_inverse_index(mapping)

    # dict to temporarily store agent attributes, keyed by the uncovered
    # (qualified "name" key, resolved name) pair
    attrs = defaultdict(set)
    # map of old agent identifiers to new agent identifiers
    # used to reroute relationships
    reroute = dict()
    # prov records that are not affected by this operation
    records = list(graph.get_records((ProvEntity, ProvActivity)))

    # First pass: collect the attributes of all aliases before building any
    # identifier, so every alias of a user ends up with the same identifier.
    resolved = []
    for agent in graph.get_records(ProvAgent):
        # resolve the agent alias (uncover its identity)
        name = uncover_name(agent, names)
        # start by adding the uncovered given name, then all other attributes
        attrs[name].add(name)
        attrs[name].update(t for t in agent.attributes if t[0].localpart != "name")
        resolved.append((agent, name))

    # Second pass: build the identifiers from the attribute KEYS "name" and
    # "email". Filtering on the attribute values instead matches nothing and
    # gives every agent the same identifier.
    for agent, name in resolved:
        # sorted for a deterministic identifier (sets have no stable order)
        emails = sorted({str(val) for key, val in attrs[name] if key.localpart == "email"})
        repr_attrs = [("name", name[1])] + [("email", mail) for mail in emails]
        identifier = qualified_name(f"User?{urlencode(repr_attrs)}")
        records.append(ProvAgent(agent.bundle, identifier, attrs[name]))

        reroute[agent.identifier] = identifier

    for relation in graph.get_records(ProvRelation):
        formal = [(key, reroute.get(val, val)) for key, val in relation.formal_attributes]
        extra = [(key, reroute.get(val, val)) for key, val in relation.extra_attributes]
        r_type = PROV_REC_CLS.get(relation.get_type())
        records.append(r_type(relation.bundle, relation.identifier, formal + extra))

    return graph_factory(records).unified()
def get_package_root() -> Path:
    """Return the directory that contains this module."""
    module_file = Path(__file__)
    return module_file.parent
@dataclass
class MessageBus:
    """Dispatch a command to every handler registered for its type."""

    # Unit of work held by the bus; ``handle`` itself does not touch it.
    uow: AbstractUnitOfWork
    # Maps a command type to the handlers that are run, in order, for it.
    handlers: dict[type[Command], list[Callable]]

    def handle(self, command: Command) -> ProvDocument | None:
        """Run all handlers registered for ``type(command)`` in order.

        Args:
            command: The command to dispatch; each handler is called with it
                as the sole argument.

        Returns:
            The return value of the last handler that ran, or ``None`` when
            the handler list for this command type is empty.

        Raises:
            KeyError: If no entry exists for ``type(command)``.
            Exception: Any exception raised by a handler is logged and
                re-raised unchanged; later handlers are not run.
        """
        # TODO: Return more than the last result...
        # Pre-bind the result so an empty handler list yields None instead of
        # an UnboundLocalError at the return statement.
        result = None
        for handler in self.handlers[type(command)]:
            try:
                logger.debug(f"Handling command {command}.")
                result = handler(command)
            except Exception:
                logger.exception(f"Exception handling command {command}.")
                raise
        return result
11 | requires-python = ">=3.10" 12 | readme = { file = "README.md", content-type = "text/markdown" } 13 | license = { file = "LICENSE" } 14 | authors = [{ name = "Claas de Boer", email = "claas.deboer@dlr.de" }] 15 | maintainers = [ 16 | { name = "Andreas Schreiber", email = "andreas.schreiber@dlr.de" }, 17 | ] 18 | dependencies = [ 19 | "prov>=2.0.0", # MIT License 20 | "git-python", # BSD 3-Clause License 21 | "python-gitlab", # LGPL-3.0 License 22 | "jsonschema", # MIT License 23 | "ruamel.yaml", # MIT License 24 | "pydot>=1.2.0", # MIT License 25 | "click", # BSD 3-Clause License 26 | ] 27 | keywords = [ 28 | "prov", 29 | "gitlab", 30 | "git", 31 | "provenance", 32 | "prov generation", 33 | "software analytics", 34 | "w3c prov", 35 | ] 36 | classifiers = [ 37 | "Development Status :: 5 - Production/Stable", 38 | "Environment :: Console", 39 | "Intended Audience :: Science/Research", 40 | "License :: OSI Approved :: MIT License", 41 | "Programming Language :: Python", 42 | "Programming Language :: Python :: 3 :: Only", 43 | "Topic :: Scientific/Engineering", 44 | "Topic :: Scientific/Engineering :: Information Analysis", 45 | "Topic :: Software Development :: Version Control :: Git", 46 | ] 47 | dynamic = ["version"] 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "pytest", # MIT License 52 | "pytest-mock", # MIT License 53 | "black", # MIT License 54 | "isort", # MIT License 55 | "bump-my-version", # MIT License 56 | ] 57 | 58 | [project.scripts] 59 | gitlab2prov = "gitlab2prov.entrypoints.cli:cli" 60 | 61 | [project.urls] 62 | Twitter = "https://twitter.com/dlr_software" 63 | "Source Code" = "https://github.com/dlr-sc/gitlab2prov" 64 | "Issue Tracker" = "https://github.com/dlr-sc/gitlab2prov/issues" 65 | 66 | [tool.setuptools.dynamic] 67 | version = { attr = "gitlab2prov.__version__" } 68 | 69 | [tool.setuptools.packages.find] 70 | exclude = ["tests*", "docs*"] 71 | 72 | [tool.isort] 73 | profile = "black" 74 | py_version = 310 75 | 76 | 
[tool.black] 77 | line-length = 99 78 | target-version = ['py310'] 79 | 80 | [tool.bumpversion] 81 | current_version = "2.2.0" 82 | commit = true 83 | tag = true 84 | tag_name = "{new_version}" 85 | message = "Bump version: {current_version} -> {new_version}" 86 | 87 | [[tool.bumpversion.files]] 88 | filename = "gitlab2prov/__init__.py" 89 | search = '__version__ = "{current_version}"' 90 | replace = '__version__ = "{new_version}"' 91 | 92 | [[tool.bumpversion.files]] 93 | filename = "CITATION.cff" 94 | search = 'version: "{current_version}"' 95 | replace = 'version: "{new_version}"' 96 | 97 | [[tool.bumpversion.files]] 98 | filename = "CITATION.cff" 99 | search = 'date-released: "\d{{4}}-\d{{2}}-\d{{2}}"' 100 | replace = 'date-released: "{now:%Y-%m-%d}"' 101 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/tests/__init__.py -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DLR-SC/gitlab2prov/daafc96c644d06e21d377ecc870fc3fda41d3528/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/test_repository.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from gitlab2prov.adapters import repository 4 | from 
class TestInMemoryRepository:
    """Integration tests for the lookup methods of ``InMemoryRepository``."""

    def test_get(self):
        """``get`` returns the stored resource that matches the filter."""
        repo = repository.InMemoryRepository()
        u1 = objects.User(name="u1", email="e1", prov_role="r1")
        u2 = objects.User(name="u2", email="e2", prov_role="r2")
        repo.add(u1)
        repo.add(u2)
        assert repo.get(objects.User, name="u1") == u1
        assert repo.get(objects.User, name="u2") == u2

    def test_get_returns_none_if_repository_is_empty(self):
        """``get`` on an empty repository yields ``None``."""
        repo = repository.InMemoryRepository()
        # Identity check: ``== None`` could be satisfied by any object with a
        # permissive ``__eq__``, which would hide a wrong return value.
        assert repo.get(objects.User, name="name") is None

    def test_list_all(self):
        """``list_all`` returns every stored resource matching the filter."""
        repo = repository.InMemoryRepository()
        u1 = objects.User(name="u1", email="e1", prov_role="r1")
        u2 = objects.User(name="u2", email="e2", prov_role="r1")
        repo.add(u1)
        repo.add(u2)
        assert repo.list_all(objects.User, name="u1") == [u1]
        assert repo.list_all(objects.User, name="u2") == [u2]
        # Both users share the role, so both are expected, in insertion order.
        assert repo.list_all(objects.User, prov_role="r1") == [u1, u2]

    def test_list_all_returns_empty_list_if_repository_is_empty(self):
        """``list_all`` on an empty repository yields an empty list."""
        repo = repository.InMemoryRepository()
        assert repo.list_all(objects.User, name="name") == []
class TestNormalize:
    """Checks for the ``normalize`` helper."""

    def test_removes_trailing_whitespace(self):
        # A space-padded input must come back without a blank at either end.
        normalized = normalize(" test ")
        assert not (normalized.startswith(" ") or normalized.endswith(" "))

    def test_lowercase(self):
        # An all-caps input must come back lower-cased.
        shouted = "TEST"
        lowered = normalize(shouted)
        assert lowered.islower()
class TestClassifySystemNote:
    """Checks for ``classify_system_note``.

    The tests index the result: position 0 is compared against a classifier
    name, position 1 against a dict of captured groups.
    """

    def test_returns_import_statement_capture_groups(self):
        # Both timestamp spellings of the import suffix must yield the author.
        notes = (
            "*by original-author on 1970-01-01T00:00:00 (imported from gitlab project)*",
            "*by original-author on 1970-01-01 00:00:00 UTC (imported from gitlab project)*",
        )
        wanted = {"pre_import_author": "original-author"}
        for note in notes:
            assert classify_system_note(note)[1] == wanted

    def test_returns_annotation_classifier_capture_groups(self):
        captures = classify_system_note("assigned to @developer")[1]
        assert captures == {"user_name": "developer"}

    def test_returns_combined_capture_groups_of_the_import_statement_and_the_classifier(
        self,
    ):
        note = "assigned to @developer *by original-author on 1970-01-01T00:00:00 (imported from gitlab project)*"
        wanted = {
            "user_name": "developer",
            "pre_import_author": "original-author",
        }
        captures = classify_system_note(note)[1]
        assert captures == wanted

    def test_returns_classifier_name_for_known_string(self):
        name = classify_system_note("assigned to @developer")[0]
        assert name == "assign_user"

    def test_returns_default_annotation_for_unknown_string(self):
        # A note that no classifier recognises falls back to the default name.
        name = classify_system_note("UNKNOWN")[0]
        assert name == "default_annotation"
class TestClassifier:
    """Unit tests for ``Classifier`` matching, group capture and length."""

    def test_longest_matching_classifier_wins_selection(self):
        classifiers = [
            Classifier(patterns=[r"\d{1}"]),
            Classifier(patterns=[r"\d{2}"]),
            Classifier(patterns=[r"\d{3}"]),
        ]
        for classifier in classifiers:
            classifier.matches(string.digits)
        # ``len`` of a classifier is used as the ranking key.
        assert max(classifiers, key=len) == classifiers[-1]

    def test_matches_should_return_true_if_any_pattern_matches(self):
        classifier = Classifier(patterns=[r"\d", r"\s"])
        assert classifier.matches(string.digits) == True

    def test_matches_should_return_false_if_no_pattern_matches(self):
        c = Classifier(patterns=[r"\d", r"\s"])
        assert c.matches(string.ascii_letters) == False

    def test_matches_should_store_the_longest_match_in_the_class_attributes(self):
        regexes = [r"\d{1}", r"\d{2}", r"\d{3}"]
        classifier = Classifier(patterns=regexes)
        classifier.matches(string.digits)
        assert classifier.match.re.pattern == regexes[-1]

    def test_groupdict_should_return_empty_dict_if_no_pattern_matches(self):
        classifier = Classifier(patterns=[r"\d"])
        classifier.matches(string.ascii_letters)
        assert classifier.groupdict() == dict()

    def test_groupdict_should_return_captured_groups_if_a_pattern_matches(self):
        # The group must be named "number": the assertion below looks that key
        # up, and a bare ``(?P\d)`` is not a valid regular expression.
        classifier = Classifier(patterns=[r"(?P<number>\d)"])
        classifier.matches(string.digits)
        assert classifier.groupdict() == {"number": string.digits[0]}

    def test_length_should_be_0_if_no_match_was_found(self):
        classifier = Classifier(patterns=[r"\d"])
        classifier.matches(string.ascii_letters)
        assert len(classifier) == 0

    def test_length_should_be_the_span_of_the_found_match(self):
        classifier = Classifier(patterns=[r"\d"])
        classifier.matches(string.digits)
        assert len(classifier) == 1
class FakeUnitOfWork(unit_of_work.AbstractUnitOfWork):
    """Unit of work for handler tests, backed by a ``FakeRepository``.

    ``committed`` records whether ``_commit`` has run so tests can assert
    on it.
    """

    def __init__(self):
        self.resources = FakeRepository([])
        self.committed = False

    def _commit(self):
        self.committed = True

    def _reset(self):
        # ``AbstractUnitOfWork`` declares ``_reset`` abstract; without this
        # override the class cannot be instantiated. Swap in an empty
        # repository, as ``InMemoryUnitOfWork._reset`` does.
        self.resources = FakeRepository([])

    def rollback(self):
        # Nothing to undo for the in-memory fake.
        pass
def bootstrap_test_app(git_resources=None, gitlab_resources=None):
    """Call ``bootstrap.bootstrap`` with fake collaborators.

    Passing ``None`` for either resource argument substitutes an empty list.
    The resources are handed to ``FakeGitFetcher`` / ``FakeGitlabFetcher``
    and a fresh ``FakeUnitOfWork`` is used as the unit of work.
    """
    git_resources = [] if git_resources is None else git_resources
    gitlab_resources = [] if gitlab_resources is None else gitlab_resources
    dependencies = {
        "uow": FakeUnitOfWork(),
        "git_fetcher": FakeGitFetcher(git_resources),
        "gitlab_fetcher": FakeGitlabFetcher(gitlab_resources),
    }
    return bootstrap.bootstrap(**dependencies)
class TestDedupe:
    """Checks that ``operations.dedupe`` collapses duplicate records."""

    def test_removes_duplicate_elements(self):
        agent = ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}"))
        doubled = ProvDocument([agent, agent])
        single = ProvDocument([agent])
        # Sanity check: the input really holds the agent twice.
        assert list(doubled.get_records(ProvAgent)) == [agent, agent]
        remaining_agents = list(operations.dedupe(doubled).get_records(ProvAgent))
        assert remaining_agents == [agent]
        assert operations.dedupe(doubled) == single

    def test_merges_attributes_of_duplicate_elements(self):
        agent_id = qualified_name(f"agent-id-{random_suffix()}")
        graph = ProvDocument()
        # Same identifier twice, each time with a different attribute.
        graph.agent(agent_id, {"attribute1": 1})
        graph.agent(agent_id, {"attribute2": 2})
        merged_attributes = [
            (qualified_name("attribute1"), 1),
            (qualified_name("attribute2"), 2),
        ]
        deduped = operations.dedupe(graph)
        agents = list(deduped.get_records(ProvAgent))
        assert len(agents) == 1
        assert agents[0].attributes == merged_attributes

    def test_remove_duplicate_relations(self):
        graph = ProvDocument()
        agent = graph.agent(qualified_name(f"agent-id-{random_suffix()}"))
        entity = graph.entity(qualified_name(f"entity-id-{random_suffix()}"))
        first = graph.wasAttributedTo(entity, agent)
        second = graph.wasAttributedTo(entity, agent)
        assert list(graph.get_records(ProvRelation)) == [first, second]
        surviving = list(operations.dedupe(graph).get_records(ProvRelation))
        assert surviving == [first]

    def test_merges_attributes_of_duplicate_relations(self):
        graph = ProvDocument()
        agent = graph.agent(qualified_name(f"agent-id-{random_suffix()}"))
        entity = graph.entity(qualified_name(f"entity-id-{random_suffix()}"))
        # Two otherwise identical relations that differ only in the value of
        # their extra attribute.
        for value in ("val1", "val2"):
            graph.wasAttributedTo(
                entity, agent, other_attributes=[(qualified_name("attr"), value)]
            )

        graph = operations.dedupe(graph)

        relations = list(graph.get_records(ProvRelation))
        assert len(relations) == 1
        wanted = {
            (qualified_name("attr"), "val1"),
            (qualified_name("attr"), "val2"),
        }
        assert set(relations[0].extra_attributes) == wanted
class TestPseudonymize:
    """Checks for ``operations.pseudonymize``."""

    def test_pseudonymize_changes_agent_name_and_identifier(self):
        name = f"agent-name-{random_suffix()}"
        email = f"agent-email-{random_suffix()}"
        graph = operations.graph_factory()
        graph.agent("agent1", {"name": name, "email": email})

        graph = operations.pseudonymize(graph)

        # Expected pseudonyms are the SHA-256 hex digests of the UTF-8 values.
        name_digest = hashlib.sha256(name.encode("utf-8")).hexdigest()
        email_digest = hashlib.sha256(email.encode("utf-8")).hexdigest()
        wanted_identifier = qualified_name(f"User?name={name_digest}&email={email_digest}")

        agent = next(graph.get_records(ProvAgent))
        assert agent.identifier == wanted_identifier
        stored_name = list(agent.get_attribute("name"))[0]
        stored_email = list(agent.get_attribute("email"))[0]
        assert stored_name == name_digest
        assert stored_email == email_digest

    def test_pseudonymize_deletes_non_name_attributes_apart_from_role_and_type(self):
        attributes = {
            "name": f"agent-name-{random_suffix()}",
            "email": f"email-{random_suffix()}",
            "gitlab_username": f"gitlab-username-{random_suffix()}",
            "gitlab_id": f"gitlab-id-{random_suffix()}",
            PROV_ROLE: f"prov-role-{random_suffix()}",
            PROV_TYPE: f"prov-type-{random_suffix()}",
        }
        graph = operations.graph_factory()
        graph.agent("agent1", attributes)

        graph = operations.pseudonymize(graph)

        agent = next(graph.get_records(ProvAgent))
        # Only these attribute keys may remain on the pseudonymized agent.
        allowed = [
            PROV_ROLE,
            PROV_TYPE,
            qualified_name("name"),
            qualified_name("email"),
        ]
        assert all(key in allowed for key, _ in agent.extra_attributes)