├── .dockerignore ├── docs ├── changelog_link.md ├── favicon.ico ├── cli.rst ├── intro │ ├── linked_data.rst │ ├── git.rst │ ├── quickstart.rst │ ├── usage_python.rst │ └── tokens.rst ├── Makefile ├── make.bat ├── index.rst ├── conf.py ├── logo_notext.svg └── logo.svg ├── gimie ├── parsers │ ├── license │ │ ├── data │ │ │ ├── tfidf_matrix.npz │ │ │ ├── spdx_licenses.csv │ │ │ └── tfidf_vectorizer.json │ │ └── __init__.py │ ├── abstract.py │ ├── __init__.py │ └── cff.py ├── graph │ ├── __init__.py │ ├── namespaces.py │ └── operations.py ├── __init__.py ├── extractors │ ├── abstract.py │ ├── common │ │ └── queries.py │ ├── __init__.py │ ├── git.py │ ├── gitlab.py │ └── github.py ├── project.py ├── io.py ├── utils │ ├── uri.py │ └── text_processing.py ├── cli.py └── models.py ├── tests ├── conftest.py ├── test_project.py ├── test_cli.py ├── test_github.py ├── test_parsers.py ├── test_gitlab.py ├── test_output.py ├── test_tfidf.py ├── test_git.py └── test_cff.py ├── NOTICE ├── .env.dist ├── .docker ├── entrypoint.sh └── Dockerfile ├── .pre-commit-config.yaml ├── .github └── workflows │ ├── conventional-prs.yml │ ├── sphinx-docs.yml │ ├── poetry-pytest.yml │ ├── poetry-publish.yml │ ├── poetry-test-publish.yml │ └── docker-publish.yml ├── Makefile ├── CITATION.cff ├── scripts └── generate_tfidf.py ├── .gitignore ├── pyproject.toml ├── CHANGELOG.md ├── README.md └── LICENSE /.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | *__pycache__* 3 | -------------------------------------------------------------------------------- /docs/changelog_link.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` 3 | -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdsc-ordes/gimie/HEAD/docs/favicon.ico -------------------------------------------------------------------------------- /gimie/parsers/license/data/tfidf_matrix.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdsc-ordes/gimie/HEAD/gimie/parsers/license/data/tfidf_matrix.npz -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ********************** 3 | 4 | .. click:: gimie.cli:cli 5 | :prog: gimie 6 | :nested: full 7 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Avoid _pytest.pathlib.ImportPathMismatchError for pytest""" 2 | import os 3 | 4 | os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" 5 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Gimie 2 | Copyright 2022 - Swiss Data Science Center (SDSC) 3 | A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | Eidgenössische Technische Hochschule Zürich (ETHZ). 
5 | -------------------------------------------------------------------------------- /.env.dist: -------------------------------------------------------------------------------- 1 | # create your personal github token and add it here 2 | # see [here](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) 3 | GITHUB_TOKEN= 4 | GITLAB_TOKEN= 5 | -------------------------------------------------------------------------------- /.docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # If the first argument starts with "-" or if it is not recognized as a command, 6 | # use "gimie" as command 7 | if [ -z "${1##-*}" ] || [ -z "$(command -v $1)" ] ; then 8 | set -- gimie "$@" 9 | fi 10 | 11 | exec "$@" 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.10.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /tests/test_project.py: -------------------------------------------------------------------------------- 1 | """Test the project module.""" 2 | import pytest 3 | 4 | from gimie.extractors import GIT_PROVIDERS 5 | from gimie.project import get_extractor 6 | 7 | 8 | def test_get_extractor(): 9 | repo = "https://example.org/group/project" 10 | for prov, extractor in GIT_PROVIDERS.items(): 11 | assert type(get_extractor(repo, prov)) == extractor 12 | 13 | with pytest.raises(ValueError): 14 | get_extractor(repo, "bad_provider") 15 | 
-------------------------------------------------------------------------------- /docs/intro/linked_data.rst: -------------------------------------------------------------------------------- 1 | Linked data 2 | *********** 3 | 4 | The aim of gimie is to extract project metadata in an interoperable format. This is achieved by generating `linked data `_ following the widely used `schema.org `_ ontology. The resulting metadata can readily be augmented or integrated with other data sources. 5 | 6 | Gimie's output follows recommendations provided by the `codemeta project `_ , but also provides additional properties. 7 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yml: -------------------------------------------------------------------------------- 1 | name: Conventional PR title 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - reopened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | title-format: 12 | runs-on: ubuntu-latest 13 | steps: 14 | # https://github.com/amannn/action-semantic-pull-request 15 | - name: PR title format check 16 | uses: amannn/action-semantic-pull-request@v5.3.0 17 | continue-on-error: true 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | with: 21 | validateSingleCommit: true 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for the Gimie command line interface.""" 2 | 3 | from gimie import cli 4 | from typer.testing import CliRunner 5 | 6 | runner = CliRunner() 7 | 8 | 9 | def test_data_help(): 10 | """Checks if the 'gimie data --help' command exits successfully.""" 11 | result = runner.invoke(cli.app, ["data", "--help"]) 12 | assert result.exit_code == 0 13 | 14 | 15 | def test_advice_help(): 16 | """Checks if the 'gimie advice --help' command exits successfully.""" 17 | result = runner.invoke(cli.app, ["advice", "--help"]) 18 | assert result.exit_code == 0 19 | 20 | 21 | def test_parsers_help(): 22 | """Checks if the 'gimie parsers --help' command exits successfully.""" 23 | result = runner.invoke(cli.app, ["parsers", "--help"]) 24 | assert result.exit_code == 0 25 | -------------------------------------------------------------------------------- /docs/intro/git.rst: -------------------------------------------------------------------------------- 1 | Git repositories 2 | **************** 3 | 4 | Software projects are usually version-controlled and hosted on a server. Git is by far the most popular version control system, and is commonly used for scientific software and data science projects. 5 | 6 | Git natively stores some metadata about the project authors and contributions in a local index, but git providers (servers) such has Github and GitLab store and expose more advanced information about the project and contributors. 
These information are served in provider-dependent format with specific APIs. 7 | 8 | Gimie aims to provide provider-agnostic metadata in an interoperable format. It will request data from the provider API if available, or from git by cloning the repository into a temporary folder otherwise. This metadata is then converted to the widely used schema.org standard so that it can readily be integrated with other tools and services. 9 | -------------------------------------------------------------------------------- /gimie/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from typing import Tuple, Union 18 | 19 | from rdflib.term import Literal, URIRef 20 | 21 | Property = Tuple[URIRef, Union[URIRef, Literal]] 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /tests/test_github.py: -------------------------------------------------------------------------------- 1 | # Tests fetching metadata from GitHub repositories with different setups. 
2 | import pytest 3 | 4 | from gimie.extractors.github import GithubExtractor 5 | from gimie.io import RemoteResource 6 | 7 | 8 | TEST_REPOS = [ 9 | "https://github.com/sdsc-ordes/gimie", # Owned by organization, has releases 10 | "https://github.com/apache/openoffice", # Owned by organization, no releases 11 | "https://github.com/ishepard/pydriller", # Owned by user, has releases 12 | "https://github.com/rmfranken/license_test", # Contains 2 license files 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("repo", TEST_REPOS) 17 | def test_github_extract(repo): 18 | meta = GithubExtractor(repo).extract() 19 | meta.serialize(format="ttl") 20 | 21 | 22 | @pytest.mark.parametrize("repo", TEST_REPOS) 23 | def test_github_list_files(repo): 24 | files = GithubExtractor(repo).list_files() 25 | assert all(isinstance(f, RemoteResource) for f in files) 26 | -------------------------------------------------------------------------------- /tests/test_parsers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gimie.io import LocalResource 4 | from gimie.parsers import get_parser, list_parsers, parse_files 5 | from rdflib import URIRef 6 | from rdflib import Graph, URIRef, Literal 7 | 8 | 9 | def test_get_parser(): 10 | # All parsers are available 11 | for name in list_parsers(): 12 | get_parser(name) 13 | 14 | 15 | def test_get_bad_parser(): 16 | # Should raise error if parser not found 17 | with pytest.raises(ValueError): 18 | get_parser("bad_parser") 19 | 20 | 21 | def test_parse_license(): 22 | license_file = LocalResource("LICENSE") 23 | graph = parse_files( 24 | subject=URIRef("https://exmaple.org/"), files=[license_file] 25 | ) 26 | assert "https://spdx.org" in graph.serialize(format="ttl") 27 | 28 | 29 | def test_parse_nothing(): 30 | folder = LocalResource("tests") 31 | graph = parse_files(subject=URIRef("https://example.org/"), files=[folder]) 32 | assert len(graph) == 0 33 | 
-------------------------------------------------------------------------------- /tests/test_gitlab.py: -------------------------------------------------------------------------------- 1 | from gimie.io import RemoteResource 2 | from gimie.extractors.gitlab import GitlabExtractor 3 | import pytest 4 | 5 | TEST_REPOS = [ 6 | "https://gitlab.com/inkscape/inkscape", # Owned by multiple persons, has releases 7 | "https://gitlab.com/openrgb-pvazny/OpenRGB", # No user owner so group owner, no releases 8 | "https://gitlab.com/gitlab-org/gitlab-runner", # No user owner so group owner, has releases 9 | "https://gitlab.com/commonground/haven/haven", # Nested groups 10 | "https://gitlab.com/edouardklein/falsisign", # owned by user 11 | "https://gitlab.com/rmfranken/test-licenses", # Contains 2 license files 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("repo", TEST_REPOS) 16 | def test_gitlab_extract(repo): 17 | extractor = GitlabExtractor(repo) 18 | meta = extractor.extract() 19 | meta.serialize(format="ttl") 20 | 21 | 22 | @pytest.mark.parametrize("repo", TEST_REPOS) 23 | def test_gitlab_list_files(repo): 24 | files = GitlabExtractor(repo).list_files() 25 | assert all(isinstance(f, RemoteResource) for f in files) 26 | -------------------------------------------------------------------------------- /gimie/graph/namespaces.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from rdflib.namespace import Namespace 18 | 19 | SDO = Namespace("http://schema.org/") 20 | COD = Namespace("https://doi.org/10.5063/schema/codemeta-2.0/") 21 | SD = Namespace("https://w3id.org/okn/o/sd/1.9.0/") 22 | BIO = Namespace("https://bioschemas.org/") 23 | GIMIE = Namespace("https://sdsc-ordes.github.io/gimie/") 24 | MD4I = Namespace("http://w3id.org/nfdi4ing/metadata4ing#") 25 | -------------------------------------------------------------------------------- /gimie/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import logging 19 | 20 | import importlib.metadata as importlib_metadata 21 | 22 | __version__ = importlib_metadata.version(__name__) 23 | 24 | logger = logging.getLogger() 25 | stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s") 26 | stream_handler = logging.StreamHandler() 27 | stream_handler.setLevel(logging.WARNING) 28 | stream_handler.setFormatter(stdout_formatter) 29 | logger.addHandler(stream_handler) 30 | -------------------------------------------------------------------------------- /tests/test_output.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """Test the gimie output""" 18 | import pytest 19 | from rdflib import Graph 20 | 21 | from gimie.project import Project 22 | 23 | 24 | OUT_TTL = ( 25 | Project("https://github.com/sdsc-ordes/gimie", git_provider="github") 26 | .extract() 27 | .serialize(format="ttl") 28 | ) 29 | 30 | 31 | def test_validate_output_is_linked_data(): 32 | """Is output valid RDF?""" 33 | g = Graph().parse(format="ttl", data=OUT_TTL) 34 | assert g is not None 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gimie documentation master file, created by 2 | sphinx-quickstart on Tue Jun 6 16:50:55 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: logo.svg 7 | :width: 200 8 | :alt: gimie logo 9 | 10 | 11 | Welcome to gimie's documentation! 12 | ================================= 13 | gimie (Git Meta Information Extractor) is a python library and command line tool to extract structured metadata from git repositories. 14 | 15 | .. card:: :octicon:`mark-github;2em` `GitHub repository `_ 16 | 17 | Visit gimie's GitHub repository to follow the latest developments! 18 | 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Background 23 | 24 | Linked data - What is it and why do we use it? 25 | Git repositories - Where code lives 26 | Access tokens - Authenticate gimie on your behalf 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: Documentation 31 | 32 | intro/quickstart 33 | intro/usage_python 34 | API Documentation 35 | CLI Documentation 36 | 37 | .. 
toctree:: changelog_link 38 | :maxdepth: 1 39 | :caption: Changelog 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: install 2 | install: ## Install with the poetry and add pre-commit hooks 3 | @echo "🚀 Installing packages with poetry" 4 | @poetry install 5 | @poetry run pre-commit install 6 | 7 | .PHONY: check 8 | check: ## Run code quality tools. 9 | @echo "🚀 Checking Poetry lock file consistency with 'pyproject.toml': Running poetry lock --check" 10 | @poetry lock --check 11 | @echo "🚀 Linting code: Running pre-commit" 12 | @poetry run pre-commit run -a 13 | 14 | .PHONY: doc 15 | doc: ## Build sphinx documentation website locally 16 | @echo "📖 Building documentation" 17 | @cd docs 18 | @poetry run sphinx-apidoc -d 3 -f -o docs/api gimie 19 | @poetry run sphinx-build docs/ docs/_build 20 | 21 | .PHONY: docker-build 22 | docker-build: ## Build the gimie Docker image 23 | @echo "🐋 Building docker image" 24 | @docker build -t gimie -f .docker/Dockerfile . 25 | 26 | .PHONY: test 27 | test: ## Test the code with pytest 28 | @echo "🚀 Testing code: Running pytest" 29 | @poetry run pytest 30 | 31 | .PHONY: changelog 32 | changelog: ## Generate the changelog 33 | @git-cliff -l -c pyproject.toml || echo "git-cliff must be installed" 34 | 35 | .PHONY: help 36 | help: 37 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 38 | 39 | .DEFAULT_GOAL := help 40 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 
3 | 4 | cff-version: 1.2.0 5 | title: gimie 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Cyril 12 | family-names: Matthey-Doret 13 | affiliation: Swiss Data Science Center 14 | orcid: 'https://orcid.org/0000-0002-1126-1535' 15 | - given-names: Sabine 16 | family-names: Maennel 17 | orcid: 'https://orcid.org/0009-0001-3022-8239' 18 | affiliation: Swiss Data Science Center 19 | - given-names: Robin 20 | family-names: Franken 21 | orcid: 'https://orcid.org/0009-0008-0143-9118' 22 | affiliation: Swiss Data Science Center 23 | - given-names: Martin 24 | family-names: Fontanet 25 | orcid: 'https://orcid.org/0000-0002-6441-8540' 26 | affiliation: Swiss Data Science Center 27 | - given-names: Laure 28 | family-names: Vancauwenberghe 29 | affiliation: Swiss Data Science Center 30 | - given-names: Stefan 31 | family-names: Milosavljevic 32 | email: supermegaiperste@hotmail.com 33 | affiliation: Swiss Data Science Center 34 | repository-code: 'https://github.com/sdsc-ordes/gimie' 35 | abstract: Extract linked metadata from repositories 36 | keywords: 37 | - git 38 | - cli 39 | - library 40 | - linked-open-data 41 | - metadata-extraction 42 | - fair-data 43 | - scientific-software 44 | license: Apache-2.0 45 | -------------------------------------------------------------------------------- /gimie/graph/operations.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Operations on graphs.""" 18 | from functools import reduce 19 | from typing import Set 20 | 21 | from rdflib import Graph 22 | from rdflib.term import URIRef 23 | 24 | from gimie.graph import Property 25 | 26 | 27 | def combine_graphs(*graphs: Graph) -> Graph: 28 | """Combines an arbitrary number of input graphs 29 | into a single graph.""" 30 | return reduce(lambda g1, g2: g1 | g2, graphs) 31 | 32 | 33 | def properties_to_graph(uri: URIRef, properties: Set[Property]) -> Graph: 34 | """Attaches a set of predicate-object tuples to input 35 | URI to produce an RDF graph.""" 36 | g = Graph() 37 | for pred, obj in properties: 38 | g.add((uri, pred, obj)) 39 | return g 40 | -------------------------------------------------------------------------------- /docs/intro/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick start 2 | *********** 3 | 4 | The easiest way to use gimie is to run it as a command line tool. Here's how to get started: 5 | 6 | Install using pip or docker: 7 | 8 | .. tab-set:: 9 | 10 | .. tab-item:: pip 11 | :sync: pip 12 | :selected: 13 | 14 | .. code-block:: console 15 | 16 | pip install gimie 17 | 18 | .. tab-item:: docker 19 | :sync: docker 20 | 21 | .. code-block:: console 22 | 23 | docker pull ghcr.io/sdsc-ordes/gimie:latest 24 | 25 | 26 | .. warning:: 27 | 28 | Before running gimie, you will need to obtain a personal access token for the GitHub and/or GitLab and export it as an environment variable. 
See :ref:`Token management` for more information. 29 | 30 | 31 | Gimie can then be used as follows to extract repository metadata: 32 | 33 | .. tab-set:: 34 | 35 | .. tab-item:: pip 36 | :sync: pip 37 | :selected: 38 | 39 | .. code-block:: console 40 | :emphasize-text: 41 | 42 | gimie data > output.ttl 43 | 44 | .. tab-item:: docker 45 | :sync: docker 46 | 47 | .. code-block:: console 48 | :emphasize-text: 49 | 50 | docker run -e GITHUB_TOKEN=${GITHUB_TOKEN} ghcr.io/sdsc-ordes/gimie:latest data > output.ttl 51 | 52 | 53 | .. note:: 54 | 55 | When running gimie in a container, you need to pass your github or gitlab token as an environment variable inside the container: 56 | -------------------------------------------------------------------------------- /.docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG VERSION_BUILD 2 | 3 | FROM python:3.12-slim-bookworm AS python 4 | ENV PYTHONUNBUFFERED=true 5 | WORKDIR /app 6 | 7 | LABEL org.opencontainers.image.source=https://github.com/sdsc-ordes/gimie 8 | LABEL org.opencontainers.image.description="Extract linked metadata from repositories." 
9 | LABEL org.opencontainers.image.licenses=Apache-2.0 10 | LABEL org.opencontainers.image.version=${VERSION_BUILD} 11 | 12 | ################################################## 13 | # Poetry setup 14 | ################################################## 15 | FROM python AS poetry 16 | 17 | # Install poetry 18 | ENV POETRY_HOME=/opt/poetry 19 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true 20 | ENV PATH="$POETRY_HOME/bin:$PATH" 21 | RUN python -c 'from urllib.request import urlopen; print(urlopen("https://install.python-poetry.org").read().decode())' | python - 22 | 23 | # Copy necessary files only 24 | COPY gimie ./gimie 25 | COPY pyproject.toml ./pyproject.toml 26 | COPY poetry.lock ./poetry.lock 27 | COPY .env.dist ./.env.dist 28 | COPY README.md ./README.md 29 | RUN apt-get update && \ 30 | apt-get install -y gcc 31 | 32 | # Poetry install 33 | RUN poetry install --no-interaction --no-ansi -vvv 34 | 35 | 36 | ################################################## 37 | # Gimie setup 38 | ################################################## 39 | FROM python AS runtime 40 | ENV PATH="/app/.venv/bin:$PATH" 41 | RUN apt-get update && \ 42 | apt-get install -y git libgomp1 libmagic-dev 43 | COPY --from=poetry /app /app 44 | COPY ".docker/entrypoint.sh" "/entrypoint.sh" 45 | 46 | # Set user 47 | RUN useradd -ms /bin/bash gimie_user 48 | USER gimie_user 49 | 50 | # Test gimie 51 | RUN gimie --version 52 | 53 | # Set command and entrypoint 54 | CMD ["gimie"] 55 | ENTRYPOINT ["/entrypoint.sh"] 56 | -------------------------------------------------------------------------------- /.github/workflows/sphinx-docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | paths: 7 | - 'docs/**' 8 | 9 | permissions: 10 | contents: write 11 | jobs: 12 | docs-build: 13 | runs-on: ubuntu-latest 14 | if: github.ref != 'refs/heads/main' 15 | steps: 16 | # https://github.com/actions/checkout 17 | 
- uses: actions/checkout@v4 18 | 19 | # https://github.com/actions/setup-python 20 | - uses: actions/setup-python@v4 21 | 22 | # https://github.com/snok/install-poetry 23 | - name: Install Poetry 24 | uses: snok/install-poetry@v1 25 | 26 | - name: Install dependencies 27 | run: | 28 | poetry install --with doc 29 | 30 | - name: Sphinx build 31 | run: | 32 | make doc 33 | 34 | docs-push: 35 | runs-on: ubuntu-latest 36 | if: github.ref == 'refs/heads/main' 37 | steps: 38 | # https://github.com/actions/checkout 39 | - uses: actions/checkout@v4 40 | 41 | # https://github.com/actions/setup-python 42 | - uses: actions/setup-python@v4 43 | 44 | # https://github.com/snok/install-poetry 45 | - name: Install Poetry 46 | uses: snok/install-poetry@v1 47 | 48 | - name: Install dependencies 49 | run: | 50 | poetry install --with doc 51 | 52 | - name: Sphinx build 53 | run: | 54 | make doc 55 | 56 | # https://github.com/peaceiris/actions-gh-pages 57 | - name: Deploy 58 | uses: peaceiris/actions-gh-pages@v3 59 | # if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/docs-website' }} 60 | with: 61 | publish_branch: gh-pages 62 | github_token: ${{ secrets.GITHUB_TOKEN }} 63 | publish_dir: docs/_build/ 64 | force_orphan: true 65 | -------------------------------------------------------------------------------- /.github/workflows/poetry-pytest.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, workflow_call] 4 | 5 | jobs: 6 | 7 | test: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12"] 12 | steps: 13 | # https://github.com/actions/checkout 14 | - uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | 18 | # https://github.com/actions/setup-python 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | # 
https://github.com/snok/install-poetry 25 | - name: Install Poetry 26 | uses: snok/install-poetry@v1 27 | 28 | - name: Install Dependencies 29 | run: poetry install 30 | if: steps.cache.outputs.cache-hit != 'true' 31 | 32 | - name: Code Quality 33 | run: poetry run black . --check 34 | 35 | - name: Test with pytest 36 | env: 37 | GITHUB_TOKEN: ${{ secrets.ACCESS_GITHUB_TOKEN }} 38 | GITLAB_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }} 39 | run: make test 40 | 41 | - name: Upload coverage report 42 | env: 43 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} 44 | GITHUB_TOKEN: ${{ secrets.ACCESS_GITHUB_TOKEN }} 45 | GITLAB_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }} 46 | COVERALLS_PARALLEL: true 47 | run: | 48 | pip install coveralls 49 | coveralls --service=github-actions 50 | continue-on-error: true 51 | 52 | finish: 53 | needs: test 54 | if: ${{ always() }} 55 | runs-on: ubuntu-latest 56 | steps: 57 | # https://github.com/coverallsapp/github-action 58 | - name: Coveralls Finished 59 | uses: coverallsapp/github-action@v2 60 | with: 61 | parallel-finished: true 62 | -------------------------------------------------------------------------------- /gimie/parsers/abstract.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class Parser(ABC):
    """
    Parser is an Abstract Base Class. It is only meant
    to define a standard interface for all parsers.

    All subclasses must implement parse(). A parser parses
    bytes data into an RDF graph of properties attached to
    a fixed subject.

    Parameters
    ----------
    subject:
        The subject of a triple (subject - predicate - object) to be used
        for writing parsed properties to.
    """

    def __init__(self, subject: str):
        self.subject = URIRef(subject)

    @abstractmethod
    def parse(self, data: bytes) -> Graph:
        """Extract rdf graph from a source."""
        ...

    def parse_all(self, docs: Iterable[bytes]) -> Graph:
        """Parse multiple sources and return the union of their triples.

        Returns
        -------
        Graph
            Union of the graphs produced by parsing each document.
            An empty graph is returned when ``docs`` is empty.
        """
        graphs = map(self.parse, docs)
        # Seed the fold with an empty Graph: reduce() without an initial
        # value raises TypeError on an empty iterable.
        return reduce(lambda g1, g2: g1 | g2, graphs, Graph())
#!/usr/bin/env python3
"""Download all SPDX licenses and fit a tf-idf vectorizer to them.
The tf-idf matrix, vectorizer and license list are then saved to disk."""

import json
from pathlib import Path
from typing import List, NamedTuple

import numpy as np
import scipy.sparse as sp
import requests

# The vectorizer lives in gimie.utils.text_processing (see project tree and
# tests/test_tfidf.py); "gimie.utils.text" is not an existing module.
from gimie.utils.text_processing import TfidfConfig, TfidfVectorizer

# Output directory holding the pre-fitted license classification data.
OUT_DIR = Path("gimie") / "parsers" / "license" / "data"

# Retrieve metadata for all OSI approved and valid licenses from SPDX
SPDX_LIST_URL = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
all_licenses = requests.get(SPDX_LIST_URL).json()["licenses"]
licenses = [
    lic
    for lic in all_licenses
    if lic["isOsiApproved"] and not lic["isDeprecatedLicenseId"]
]


# Assemble corpus of license texts (this takes a while)
class License(NamedTuple):
    # SPDX identifier, e.g. "MIT"
    license_id: str
    # Full license text
    text: str


corpus: List[License] = []

for lic in licenses:
    resp = requests.get(lic["detailsUrl"])
    if not resp.ok:
        # Skip licenses whose full text cannot be fetched.
        continue
    corpus.append(License(lic["licenseId"], resp.json()["licenseText"]))

# Fit tfidf vectorizer to corpus
texts = [entry.text for entry in corpus]
vectorizer = TfidfVectorizer(
    config=TfidfConfig(
        max_features=700, ngram_range=(1, 2), sublinear_tf=True, norm="l2"
    )
)
tfidf = vectorizer.fit_transform(texts)

# Save vectorizer and tfidf matrix
with open(OUT_DIR / "tfidf_vectorizer.json", "w") as fp:
    fp.write(vectorizer.model_dump_json())
# Prune precision to reduce size; float16 is enough for similarity scoring.
tfidf.data = tfidf.data.astype(np.float16)
sp.save_npz(OUT_DIR / "tfidf_matrix.npz", tfidf)
# License index: one "<spdx_id>,<text_length>" row per license.
with open(OUT_DIR / "spdx_licenses.csv", "w") as fp:
    for entry in corpus:
        fp.write(f"{entry.license_id},{len(entry.text)}\n")
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "gimie"
copyright = "2023, sdsc-ordes"
author = "sdsc-ordes"
# Documented release version; presumably kept in sync with pyproject.toml —
# TODO confirm (not visible from this file).
release = "0.7.2"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.autodoc",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx.ext.coverage",
    "sphinx.ext.viewcode",
    "sphinx.ext.githubpages",
    "sphinx.ext.autosectionlabel",
    "sphinx_click",
    "sphinx_copybutton",
    "sphinx_design",
    "myst_parser",
    "sphinxawesome_theme.highlighting",
]

templates_path = ["_templates"]

# Build both reStructuredText and Markdown sources (Markdown via myst_parser).
source_suffix = {
    ".rst": "restructuredtext",
    ".md": "markdown",
}


exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinxawesome_theme"
html_static_path = ["_static"]
html_logo = "logo_notext.svg"
html_favicon = "favicon.ico"


# -- Extension configuration -------------------------------------------------

# Options for intersphinx

# Cross-reference objects from external projects' documentation by name.
intersphinx_mapping = {
    "python": ("https://docs.python.org/", None),
    "rdflib": ("https://rdflib.readthedocs.io/en/stable/", None),
    "calamus": ("https://calamus.readthedocs.io/en/latest/", None),
}
# Jobs are split to prevent unnecessary privilege elevation through write permissions during building.
"""Tests for the TfidfVectorizer implementation in gimie.utils.text_processing."""
import json
from typing import List

import numpy as np
import pytest

from gimie.utils.text_processing import TfidfConfig, TfidfVectorizer

# Tiny two-document corpus shared by all tests below.
CORPUS = [
    "This is my test document.",
    "This is another test document.",
]


@pytest.fixture
def tfidf_vectorizer() -> TfidfVectorizer:
    """Fixture for a TfidfVectorizer instance."""
    config = TfidfConfig(norm="l2", sublinear_tf=True)
    return TfidfVectorizer(config=config)


def test_tfidf_serde(tfidf_vectorizer: TfidfVectorizer):
    """Test json serialization and deserialization of TfidfVectorizer."""
    json_str = tfidf_vectorizer.model_dump_json(indent=2)
    # The dump must be valid JSON and round-trip through model validation.
    json.loads(json_str)
    print(TfidfVectorizer.model_validate_json(json_str))


def test_tfidf_fit_transform(tfidf_vectorizer: TfidfVectorizer):
    """Test correctness of tfidf fit."""
    _ = tfidf_vectorizer.fit_transform(CORPUS)
    # targets computed using sklearn 1.2.2
    target_voc = {
        "another": 0,
        "document": 1,
        "is": 2,
        "my": 3,
        "test": 4,
        "this": 5,
    }
    target_idf = np.array(
        [1.4054651081081644, 1.0, 1.0, 1.4054651081081644, 1.0, 1.0]
    )
    # Fitted vocabulary must map each term to the same index as sklearn.
    assert all(
        [v == target_voc[t] for t, v in tfidf_vectorizer.vocabulary.items()]
    )
    # Learned idf weights are compared exactly; presumably the implementation
    # follows sklearn's idf formula bit-for-bit — confirm if this flakes.
    pred_idf: List[float] = tfidf_vectorizer.idf_vector
    assert all([pred == target for pred, target in zip(pred_idf, target_idf)])


# Test fitting different configurations
@pytest.mark.parametrize(
    "config",
    [
        TfidfConfig(),
        TfidfConfig(max_features=10),
        TfidfConfig(ngram_range=(1, 2)),
        TfidfConfig(ngram_range=(2, 2)),
        TfidfConfig(smooth_idf=False),
        TfidfConfig(norm="l1"),
        TfidfConfig(norm="l2"),
        TfidfConfig(sublinear_tf=True),
        TfidfConfig(vocabulary={"this": 0, "is": 1, "test": 2}),
    ],
)
def test_tfidf_configs(config: TfidfConfig):
    """Test fitting different configurations."""
    # Smoke test: every supported configuration must fit without raising.
    vectorizer = TfidfVectorizer(config=config)
    _ = vectorizer.fit_transform(CORPUS)
Tristan Fontanet", 33 | "rmfranken", 34 | "sabrinaossey", 35 | ] 36 | assert all([n in contribs for n in names]) 37 | assert author.name == "Cyril Matthey-Doret" 38 | 39 | 40 | def test_git_creation_date(local_meta): 41 | """Test the creation date of a git repository.""" 42 | assert local_meta.date_created.astimezone( 43 | datetime.timezone.utc 44 | ) == datetime.datetime( 45 | 2022, 12, 7, 10, 19, 31, tzinfo=datetime.timezone.utc 46 | ) 47 | 48 | 49 | def test_set_uri(): 50 | meta = GitExtractor( 51 | "https://example.com/test", local_path=LOCAL_REPOSITORY 52 | ).extract() 53 | assert meta._id == "https://example.com/test" 54 | 55 | 56 | def test_clone_extract_github(): 57 | """Clone Git repository by setting git extractor 58 | explicitely and extract metadata locally.""" 59 | proj = Project(RENKU_GITHUB, git_provider="git") 60 | assert type(proj.extractor) == GitExtractor 61 | proj.extract() 62 | 63 | 64 | def test_clone_unsupported(): 65 | """Instantiate Project from unsupported provider 66 | with git as default provider""" 67 | proj = Project(UNSUPPORTED_PROV) 68 | assert type(proj.extractor) == GitExtractor 69 | proj.extract() 70 | 71 | 72 | def test_git_list_files(): 73 | files = GitExtractor(UNSUPPORTED_PROV).list_files() 74 | assert all(isinstance(f, LocalResource) for f in files) 75 | -------------------------------------------------------------------------------- /gimie/extractors/abstract.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
class Extractor(ABC):
    """Common interface for all git repository metadata extractors.

    This abstract base class only defines the contract: concrete
    extractors for each git provider must implement extract() and
    list_files().
    """

    def __init__(
        self,
        url: str,
        base_url: Optional[str] = None,
        local_path: Optional[str] = None,
    ):
        self.url = url
        self.base_url = base_url
        self.local_path = local_path

    @abstractmethod
    def extract(self) -> Repository:
        """Extract metadata from the git provider into a Repository object."""
        ...

    @abstractmethod
    def list_files(self) -> List[Resource]:
        """List all files in the repository HEAD."""
        ...

    @property
    def path(self) -> str:
        """Path to the repository without the base URL."""
        if self.base_url is not None:
            # Drop the known base and surrounding slashes.
            return self.url.removeprefix(self.base_url).strip("/")
        # No explicit base: the path component of the URL is the repo path.
        return urlparse(self.url).path.strip("/")

    @property
    def base(self) -> str:
        """Base URL of the remote."""
        if self.base_url is not None:
            return self.base_url
        # Reconstruct scheme://host from the repository URL.
        parsed = urlparse(self.url)
        return f"{parsed.scheme}://{parsed.netloc}"
CffParser(subject=URIRef("https://example.org/")).parse( 66 | data=cff_file 67 | ) 68 | ) 69 | == 0 70 | ) 71 | 72 | 73 | def test_parse_doi(): 74 | cff_file = b""" 75 | cff-version: 1.2.0 76 | message: If you use this software, please cite it using these metadata. 77 | title: 'napari: a multi-dimensional image viewer for Python' 78 | identifiers: 79 | - type: doi 80 | value: 10.5281/zenodo.3555620 81 | - type: doi 82 | value: 10.21105/joss.01274 83 | """ 84 | parsed_dois = list( 85 | CffParser(subject=URIRef("https://example.org/")) 86 | .parse(data=cff_file) 87 | .objects() 88 | ) 89 | expected_dois = [ 90 | URIRef("https://doi.org/10.5281/zenodo.3555620"), 91 | URIRef("https://doi.org/10.21105/joss.01274"), 92 | ] 93 | # parsed_dois already contains all parsed DOI objects 94 | for doi in expected_dois: 95 | assert doi in parsed_dois 96 | -------------------------------------------------------------------------------- /gimie/extractors/common/queries.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
import requests
from typing import Any, Dict, List, Union

# Hint appended when a request fails due to missing/invalid credentials.
_RATE_LIMIT_HINT = (
    "Authentication failed: API rate limit exceeded. Please check that you have added "
    "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables."
)


def _check_response(resp: requests.Response) -> None:
    """Raise ConnectionError with a descriptive message for non-200 responses.

    Shared by the REST and GraphQL helpers so both report errors
    consistently.

    Raises
    ------
    ConnectionError
        When the response status code is not 200.
    """
    if resp.status_code == 200:
        return
    try:
        error_msg = resp.json().get("message", "")
    except ValueError:
        # Error body is not JSON (e.g. an HTML gateway error page); report
        # the raw text instead of crashing with a JSON decoding error.
        error_msg = resp.text
    if "API rate limit exceeded" in error_msg:
        raise ConnectionError(_RATE_LIMIT_HINT)
    raise ConnectionError(f"API request failed: {error_msg}")


def send_rest_query(
    api: str, query: str, headers: Dict[str, str]
) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Generic function to send a query to the GitHub/GitLab rest API.

    Parameters
    ----------
    api
        Base URL of the REST API.
    query
        Endpoint path appended to the base URL.
    headers
        HTTP headers, typically carrying the auth token.
    """
    resp = requests.get(
        url=f"{api}/{query}",
        headers=headers,
    )
    _check_response(resp)
    return resp.json()


def send_graphql_query(
    api: str, query: str, data: Dict[str, Any], headers: Dict[str, str]
) -> Dict[str, Any]:
    """Generic function to send a GraphQL query to the GitHub/GitLab API.

    Parameters
    ----------
    api
        Base URL of the API; "/graphql" is appended.
    query
        GraphQL query string.
    data
        GraphQL variables substituted into the query.
    headers
        HTTP headers, typically carrying the auth token.
    """
    resp = requests.post(
        url=f"{api}/graphql",
        json={
            "query": query,
            "variables": data,
        },
        headers=headers,
    )
    _check_response(resp)
    return resp.json()
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .DS_Store 108 | **/.DS_Store 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # PyCharm 136 | .idea/ 137 | 138 | # Vscode 139 | .vscode/ 140 | .devcontainer.json 141 | .devcontainer/ 142 | -------------------------------------------------------------------------------- /gimie/parsers/license/data/spdx_licenses.csv: -------------------------------------------------------------------------------- 1 | 0BSD,643 2 | AAL,2529 3 | AFL-1.1,4676 4 | AFL-1.2,4950 5 | AFL-2.0,8986 6 | AFL-2.1,8947 7 | AFL-3.0,10315 8 | AGPL-3.0-only,34020 9 | AGPL-3.0-or-later,34020 10 | Apache-1.1,2514 11 | Apache-2.0,10280 12 | APL-1.0,46065 13 | APSL-1.0,19644 14 | APSL-1.1,20151 15 | APSL-1.2,19796 16 | APSL-2.0,20281 17 | Artistic-1.0,4854 18 | Artistic-1.0-cl8,5184 19 | Artistic-1.0-Perl,6060 20 | Artistic-2.0,8764 21 | BSD-1-Clause,1086 22 | BSD-2-Clause,1267 23 | BSD-2-Clause-Patent,2569 24 | BSD-3-Clause,1460 25 | BSD-3-Clause-LBNL,2388 26 | BSL-1.0,1338 27 | CAL-1.0,16121 28 | CAL-1.0-Combined-Work-Exception,16121 29 | CATOSL-1.1,19046 30 | CDDL-1.0,16419 31 | CECILL-2.1,21774 32 | CERN-OHL-P-2.0,8855 33 | CERN-OHL-S-2.0,13419 34 | CERN-OHL-W-2.0,14529 
35 | CNRI-Python,3381 36 | CPAL-1.0,28141 37 | CPL-1.0,11653 38 | CUA-OPL-1.0,23381 39 | ECL-1.0,2425 40 | ECL-2.0,11111 41 | EFL-1.0,919 42 | EFL-2.0,924 43 | Entessa,2277 44 | EPL-1.0,11345 45 | EPL-2.0,13946 46 | EUDatagrid,3195 47 | EUPL-1.1,13231 48 | EUPL-1.2,13648 49 | Fair,245 50 | Frameworx-1.0,9771 51 | GPL-2.0-only,17337 52 | GPL-2.0-or-later,17337 53 | GPL-3.0-only,34509 54 | GPL-3.0-or-later,34509 55 | HPND,1187 56 | ICU,1597 57 | Intel,2078 58 | IPA,9093 59 | IPL-1.0,11409 60 | ISC,823 61 | Jam,195 62 | LGPL-2.0-only,24842 63 | LGPL-2.0-or-later,24842 64 | LGPL-2.1-only,25967 65 | LGPL-2.1-or-later,25967 66 | LGPL-3.0-only,41933 67 | LGPL-3.0-or-later,41933 68 | LiLiQ-P-1.1,6351 69 | LiLiQ-R-1.1,8392 70 | LiLiQ-Rplus-1.1,8043 71 | LPL-1.0,11948 72 | LPL-1.02,11824 73 | LPPL-1.3c,18575 74 | MirOS,888 75 | MIT,1078 76 | MIT-0,915 77 | MIT-Modern-Variant,917 78 | Motosoto,20187 79 | MPL-1.0,18272 80 | MPL-1.1,23669 81 | MPL-2.0,16727 82 | MPL-2.0-no-copyleft-exception,16727 83 | MS-PL,2663 84 | MS-RL,3058 85 | MulanPSL-2.0,6850 86 | Multics,2040 87 | NASA-1.3,13778 88 | Naumen,1953 89 | NCSA,1700 90 | NGPL,4703 91 | Nokia,21002 92 | NPOSL-3.0,11799 93 | NTP,714 94 | OCLC-2.0,11121 95 | OFL-1.1,4012 96 | OFL-1.1-no-RFN,4012 97 | OFL-1.1-RFN,4012 98 | OGTSL,5277 99 | OLDAP-2.8,2195 100 | OLFL-1.3,11401 101 | OSET-PL-2.1,19843 102 | OSL-1.0,8920 103 | OSL-2.0,9880 104 | OSL-2.1,9871 105 | OSL-3.0,10309 106 | PHP-3.0,2846 107 | PHP-3.01,2855 108 | PostgreSQL,1195 109 | Python-2.0,9411 110 | QPL-1.0,4364 111 | RPL-1.1,33931 112 | RPL-1.5,32009 113 | RPSL-1.0,30267 114 | RSCPL,21050 115 | SimPL-2.0,2529 116 | SISSL,14490 117 | Sleepycat,4995 118 | SPL-1.0,23398 119 | UCL-1.0,10556 120 | Unicode-DFS-2016,2857 121 | Unlicense,1211 122 | UPL-1.0,1833 123 | VSL-1.0,2065 124 | W3C,2701 125 | Watcom-1.0,20968 126 | Xnet,1250 127 | Zlib,838 128 | ZPL-2.0,2275 129 | ZPL-2.1,2100 130 | -------------------------------------------------------------------------------- 
/docs/intro/usage_python.rst: -------------------------------------------------------------------------------- 1 | Python Usage 2 | ************ 3 | 4 | Gimie can be used as a python library. Either to run the end-to-end extraction process on an input URL, or only a specific extractor. 5 | 6 | The end-to-end extraction is performed by ``gimie.Project`` and will automatically detect the git-provider and return directly an `rdflib.Graph` object. After extracting data from the git repository, parsers are executed on the files contents to enrich the graph with additional information.: 7 | 8 | .. code-block:: python 9 | 10 | from gimie.project import Project 11 | url = 'https://github.com/apache/pulsar' 12 | proj = Project(url) 13 | g = proj.extract() 14 | 15 | 16 | A specific extractor can also be used, for example to use with GitLab projects: 17 | 18 | .. code-block:: python 19 | 20 | from gimie.extractors import GitlabExtractor 21 | url = "https://gitlab.com/data-custodian/custodian" 22 | extractor = GitlabExtractor(url) 23 | repo = extractor.extract() 24 | 25 | 26 | Unlike `Project`, extractors only extract data from the git repository without running any parser, and return a `Repository` object. 27 | 28 | The `Repository` object can be serialized to RDF or converted to an rdflib graph: 29 | 30 | .. code-block:: python 31 | 32 | type(repo) 33 | # gimie.models.Repository 34 | repo.name 35 | # 'data-custodian/custodian' 36 | repo.prog_langs 37 | # ['Go', 'Dockerfile', 'Smarty', 'Shell', 'Makefile'] 38 | repo.serialize(format='json-ld', destination='custodian.json') 39 | g = repo.to_graph() 40 | type(g) 41 | # rdflib.graph.Graph 42 | 43 | Extractors also have a `list_files()` method which provides handles to a streamable file-like interface for files in the root of the repository. 44 | 45 | .. 
code-block:: python 46 | 47 | handles = extractor.list_files() 48 | readme_handle = handles[11] 49 | readme_handle.path 50 | # PosixPath('README.md') 51 | readme_handle.open().readlines()[:2] 52 | # [b'# The Swiss Data Custodian\n', b'\n'] 53 | 54 | 55 | Parsers can also be run manually on the files contents: 56 | 57 | 58 | .. code-block:: python 59 | 60 | from gimie.parsers import LicenseParser 61 | parser = LicenseParser() 62 | license_handle = handles[8] 63 | license_contents = license_handle.open().read() 64 | parser.parse(license_contents) 65 | # {(rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://spdx.org/licenses/AGPL-3.0-only.html'))} 66 | 67 | 68 | There is also a helper function to run parsers on a list of files, 69 | selecting the correct parser based on file names: 70 | 71 | .. code-block:: python 72 | 73 | from gimie.parsers import parse_files 74 | parse_files(handles) 75 | # {(rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://spdx.org/licenses/AGPL-3.0-only.html'))} 76 | -------------------------------------------------------------------------------- /gimie/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
"""Git providers from which metadata can be extracted by gimie."""
from typing import Dict, Optional, Type
from urllib.parse import urlparse

from gimie.extractors.abstract import Extractor
from gimie.extractors.github import GithubExtractor
from gimie.extractors.gitlab import GitlabExtractor
from gimie.extractors.git import GitExtractor
from gimie.utils.uri import validate_url

# Mapping from provider name to the extractor class handling it.
GIT_PROVIDERS: Dict[str, Type[Extractor]] = {
    "git": GitExtractor,
    "github": GithubExtractor,
    "gitlab": GitlabExtractor,
}


def get_extractor(
    url: str,
    source: str,
    base_url: Optional[str] = None,
    local_path: Optional[str] = None,
) -> Extractor:
    """Instantiate the correct extractor for a given source.

    Parameters
    -----------
    url
        Where the repository metadata is extracted from.
    source
        The source of the repository (git, gitlab, github, ...).
    base_url
        The base URL of the git remote.
    local_path
        If applicable, the path to the directory where the
        repository is located.

    Raises
    ------
    ValueError
        If ``source`` is not a known git provider.

    Examples
    --------
    >>> extractor = get_extractor(
    ...     "https://github.com/sdsc-ordes/gimie",
    ...     "github"
    ... )
    """
    try:
        return GIT_PROVIDERS[source](
            url, base_url=base_url, local_path=local_path
        )
    except KeyError as err:
        raise ValueError(
            f"Unknown git provider: {source}.\n"
            f"Supported sources: {', '.join(GIT_PROVIDERS)}"
        ) from err


def infer_git_provider(url: str) -> str:
    """Given a git repository URL, return the corresponding git provider.
    Local path or unsupported git providers will return "git".

    Examples
    --------
    >>> infer_git_provider("https://gitlab.com/foo/bar")
    'gitlab'
    >>> infer_git_provider("/foo/bar")
    'git'
    >>> infer_git_provider("https://codeberg.org/dnkl/foot")
    'git'
    """
    # Fall back to git if local path
    if not validate_url(url):
        return "git"

    # Match the provider name against the host only, so a repository path
    # containing e.g. "github" does not trigger a false positive. Self-hosted
    # instances like gitlab.example.com still match.
    host = urlparse(url).netloc.lower()
    for name in GIT_PROVIDERS:
        if name != "git" and name in host:
            return name

    # Fall back to git for unsupported providers
    return "git"
code-block:: console
         :emphasize-text: <token>

         export GITLAB_TOKEN=<token>
         export GITHUB_TOKEN=<token>

   .. tab-item:: Windows

      .. code-block:: console
         :emphasize-text: <token>

         # You may need to restart windows after this
         setx GITLAB_TOKEN <token>
         setx GITHUB_TOKEN <token>


2. Use a ``.env`` file in the current directory. Gimie will look for a file named ``.env`` and source it. The file contents should be as follows:

.. code-block::
   :emphasize-text: <token>
   :caption: File: .env

   GITLAB_TOKEN=<token>
   GITHUB_TOKEN=<token>


While the latter approach can be convenient to persist your token locally, it is generally not recommended to store your tokens in plain text as they are sensitive information. Hence the first approach should be preferred in most cases.

Encrypting tokens
=================

If you are serious about security, you should use a tool like `sops <https://github.com/getsops/sops>`_ or `pass <https://www.passwordstore.org/>`_ to encrypt your secrets.

Below is a quick guide on how to use ``sops`` to store encrypted tokens, and decrypt them on the fly when using gimie.

.. dropdown:: Generating PGP key

   PGP is a public key encryption system. If you don't already have one, you will need to generate a key pair to encrypt your secrets.
   You can use the following command to generate a key pair. You will be prompted for a passphrase, but you may leave it empty if you wish.

   .. code-block:: bash

      gpg --gen-key

.. dropdown:: Set up SOPS

   SOPS needs to be configured to use your PGP key. You can do so by running the following command.
   Replace ``${FINGERPRINT}`` with the fingerprint of your PGP key (it looks like ``69AB B75E ...``); you can find it by running ``gpg --fingerprint``.
   Upon running the command below, ``sops`` will open a ``vim`` buffer where you can enter the desired content of your .env file.
65 | Upon saving the file (``:wq``), ``sops`` will encrypt the file and save it as ``.enc.env``. 66 | 67 | .. code-block:: bash 68 | 69 | sops --pgp "${FINGERPRINT}" .enc.env 70 | 71 | .. dropdown:: Source tokens 72 | 73 | Whenever you want to run gimie, you can decrypt secrets on the fly and pass them to gimie using the following command: 74 | 75 | .. code-block:: bash 76 | :emphasize-text: 77 | 78 | sops exec-env .enc.env 'gimie data ' 79 | 80 | Or if you just want to inspect the decrypted file: 81 | 82 | .. code-block:: bash 83 | 84 | sops --decrypt .enc.env 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Package description 2 | 3 | [tool.poetry] 4 | name = "gimie" 5 | version = "0.7.2" 6 | description = "Extract structured metadata from git repositories." 7 | authors = ["Swiss Data Science Center "] 8 | license = "Apache-2.0" 9 | homepage = "https://github.com/sdsc-ordes/gimie" 10 | keywords = ["metadata", "git", "extraction", "linked-data"] 11 | readme = "README.md" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "Programming Language :: Python :: 3.12", 18 | "Intended Audience :: Science/Research", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: Apache Software License", 21 | "Operating System :: OS Independent", 22 | ] 23 | 24 | # Dependency management 25 | 26 | [tool.poetry.dependencies] 27 | python = ">=3.9,<4.0" 28 | gitpython = ">=3.1.35" 29 | PyDriller = "^2.5" 30 | typer = "^0.7.0" 31 | calamus = "^0.4.2" 32 | requests = "^2.28.2" 33 | python-dotenv = "^0.21.1" 34 | python-dateutil = "^2.8.2" 35 | spdx-license-list = "^3.22" 36 | numpy = "^1.26.1" 37 | pydantic = "^2.4.2" 38 | scipy = "^1.11.3" 39 | pyyaml = "^6.0.2" 40 | 41 | 
[tool.poetry.group.dev.dependencies] 42 | black = "^22.10.0" 43 | coveralls = "^3.3.1" 44 | pre-commit = "^3.0.0" 45 | pytest = "^7.2.0" 46 | pytest-cov = "^4.1.0" 47 | 48 | 49 | [tool.poetry.group.doc.dependencies] 50 | sphinx = "<7.0.0" 51 | sphinx-click = "^4.4.0" 52 | sphinxawesome-theme = "^4.1.0" 53 | sphinx-copybutton = "^0.5.2" 54 | sphinx-design = "^0.4.1" 55 | myst-parser = "^1.0.0" 56 | 57 | [build-system] 58 | requires = ["poetry-core"] 59 | build-backend = "poetry.core.masonry.api" 60 | 61 | [tool.poetry.scripts] 62 | gimie = 'gimie.cli:app' 63 | 64 | 65 | # Tooling configuration 66 | 67 | [tool.black] 68 | line-length = 79 69 | target-version = ["py38", "py39"] 70 | 71 | [tool.pytest.ini_options] 72 | addopts = ["--doctest-modules", "--cov"] 73 | testpaths = ["gimie", "tests"] 74 | 75 | [tool.pyright] 76 | reportMissingTypeStubs = false 77 | reportUntypedBaseClass = false 78 | 79 | [tool.git-cliff.changelog] 80 | header = "Notable changes introduced in gimie releases are documented in this file\n\n" 81 | body = """ 82 | 83 | {% if version %}\ 84 | ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} 85 | {% else %}\ 86 | ## [unreleased] 87 | {% endif %}\ 88 | {% for group, commits in commits | group_by(attribute="group") %} 89 | ### {{ group | upper_first }} 90 | {% for commit in commits 91 | | filter(attribute="scope") 92 | | sort(attribute="scope") %} 93 | - *({{commit.scope}})* {{ commit.message }} 94 | {%- if commit.breaking %} 95 | {% raw %} {% endraw %}- **BREAKING**: {{commit.breaking_description}} 96 | {%- endif -%} 97 | {%- endfor -%} 98 | {%- for commit in commits %} 99 | {%- if commit.scope -%} 100 | {% else -%} 101 | - {{ commit.message }} 102 | {% if commit.breaking -%} 103 | {% raw %} {% endraw %}- **BREAKING**: {{commit.breaking_description}} 104 | {% endif -%} 105 | {% endif -%} 106 | {% endfor -%} 107 | {% raw %}\n{% endraw %}\ 108 | {% endfor %}\n 109 | """ 110 | footer = "" 111 | 112 | 
[tool.git-cliff.git] 113 | conventional_commits = true 114 | filter_commits = true 115 | commit_parsers = [ 116 | { message = "^feat", group = "Features" }, 117 | { message = "^(fix|bug)", group = "Bug Fixes" }, 118 | { message = "^doc", group = "Documentation" }, 119 | ] 120 | 121 | commit_preprocessors = [ 122 | { pattern = 'Merged PR #[0-9]: (.*)', replace = "$1" }, 123 | { pattern = " +", replace = " " }, 124 | ] 125 | -------------------------------------------------------------------------------- /gimie/project.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Orchestration of multiple extractors for a given project. 
18 | This is the main entry point for end-to-end analysis.""" 19 | from typing import Iterable, Optional, Tuple 20 | 21 | from rdflib import Graph 22 | from rdflib.term import URIRef 23 | from urllib.parse import urlparse 24 | 25 | from gimie.extractors import get_extractor, infer_git_provider 26 | from gimie.graph.operations import properties_to_graph 27 | from gimie.parsers import parse_files 28 | from gimie.utils.uri import validate_url 29 | 30 | 31 | class Project: 32 | """A class to represent a project's git repository. 33 | 34 | 35 | Parameters 36 | ---------- 37 | path: 38 | The full path (URL) of the repository. 39 | base_url: 40 | The base URL of the git remote. Can be used to 41 | specify delimitation between base URL and project name. 42 | git_provider: 43 | The name of the git provider to extract metadata from. 44 | ('git', 'github', 'gitlab') 45 | parser_names: 46 | Names of file parsers to use. ('license'). 47 | If None, default parsers are used (see gimie.parsers.PARSERS). 
48 | 49 | Examples 50 | -------- 51 | >>> proj = Project("https://github.com/sdsc-ordes/gimie") 52 | >>> assert isinstance(proj.extract(), Graph) 53 | """ 54 | 55 | def __init__( 56 | self, 57 | path: str, 58 | base_url: Optional[str] = None, 59 | git_provider: Optional[str] = None, 60 | parser_names: Optional[Iterable[str]] = None, 61 | ): 62 | if not git_provider: 63 | git_provider = infer_git_provider(path) 64 | 65 | self.base_url = base_url 66 | self.project_dir = None 67 | self._cloned = False 68 | if validate_url(path): 69 | self.url = path 70 | else: 71 | self.project_dir = path 72 | 73 | self.extractor = get_extractor( 74 | self.url, 75 | git_provider, 76 | base_url=self.base_url, 77 | local_path=self.project_dir, 78 | ) 79 | if parser_names: 80 | self.parsers = set(parser_names) 81 | else: 82 | self.parsers = None 83 | 84 | def extract(self) -> Graph: 85 | """Extract repository metadata from git provider to RDF graph and enrich with 86 | metadata parsed from file contents.""" 87 | 88 | repo = self.extractor.extract() 89 | repo_graph = repo.to_graph() 90 | 91 | files = self.extractor.list_files() 92 | parsed_graph = parse_files(self.url, files, self.parsers) 93 | 94 | repo_graph += parsed_graph 95 | return repo_graph 96 | 97 | 98 | def split_git_url(url: str) -> Tuple[str, str]: 99 | """Split a git URL into base URL and project path. 
100 | 101 | Examples 102 | -------- 103 | >>> split_git_url("https://gitlab.com/foo/bar") 104 | ('https://gitlab.com', 'foo/bar') 105 | """ 106 | base_url = urlparse(url).scheme + "://" + urlparse(url).netloc 107 | project = urlparse(url).path.strip("/") 108 | return base_url, project 109 | -------------------------------------------------------------------------------- /gimie/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """Files which can be parsed by gimie.""" 18 | from pathlib import Path 19 | from typing import Iterable, NamedTuple, Optional, Set, Type 20 | 21 | from gimie.graph import Property 22 | from gimie.io import Resource 23 | from gimie.parsers.abstract import Parser 24 | from gimie.parsers.license import LicenseParser, is_license_filename 25 | from gimie.parsers.cff import CffParser 26 | 27 | from rdflib import Graph 28 | 29 | 30 | class ParserInfo(NamedTuple): 31 | default: bool 32 | type: Type[Parser] 33 | 34 | 35 | PARSERS = { 36 | "license": ParserInfo(default=True, type=LicenseParser), 37 | "cff": ParserInfo(default=True, type=CffParser), 38 | } 39 | 40 | 41 | def get_parser(name: str) -> Type[Parser]: 42 | """Get a parser by name.""" 43 | parser = PARSERS.get(name, None) 44 | if parser is None: 45 | raise ValueError( 46 | f"Unknown parser: {name}.\n" 47 | f"Supported parsers: {', '.join(PARSERS)}" 48 | ) 49 | return parser.type 50 | 51 | 52 | def list_default_parsers() -> Set[str]: 53 | """List the names of all default parsers.""" 54 | return {k for k, v in PARSERS.items() if v.default} 55 | 56 | 57 | def list_parsers() -> Set[str]: 58 | """List the names of all parsers.""" 59 | return set(PARSERS.keys()) 60 | 61 | 62 | def select_parser( 63 | path: Path, 64 | parsers: Optional[Set[str]] = None, 65 | ) -> Optional[Type[Parser]]: 66 | """Select the appropriate parser from a collection based on a file path. 67 | If no parser is found, return None. 68 | 69 | Parameters 70 | ---------- 71 | path: 72 | The path of the file to parse. 73 | parsers: 74 | A set of parser names. If None, use the default collection. 
75 | """ 76 | # Only parse licenses and citations in the root directory 77 | if is_license_filename(path.name) and len(path.parts) == 1: 78 | name = "license" 79 | elif path.name == "CITATION.cff" and len(path.parts) == 1: 80 | name = "cff" 81 | else: 82 | return None 83 | 84 | if name not in (parsers or list_parsers()): 85 | return None 86 | return get_parser(name) 87 | 88 | 89 | def parse_files( 90 | subject: str, 91 | files: Iterable[Resource], 92 | parsers: Optional[Set[str]] = None, 93 | ) -> Graph: 94 | """For each input file, select appropriate parser among a collection and 95 | parse its contents. Return the union of all parsed properties in the form of triples. 96 | If no parser is found for a given file, skip it. 97 | 98 | Parameters 99 | ---------- 100 | subject: 101 | The subject URI of the repository. 102 | files: 103 | A collection of file-like objects. 104 | parsers: 105 | A set of parser names. If None, use the default collection. 106 | """ 107 | parsed_properties = Graph() 108 | for file in files: 109 | parser = select_parser(file.path, parsers) 110 | if not parser: 111 | continue 112 | data = file.open().read() 113 | parsed_properties |= parser(subject).parse(data or b"") 114 | return parsed_properties 115 | -------------------------------------------------------------------------------- /gimie/io.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Standard input interfaces to local or remote resources for gimie.""" 18 | 19 | import io 20 | import os 21 | from pathlib import Path 22 | import requests 23 | from typing import Iterator, Optional, Union 24 | 25 | 26 | class Resource: 27 | """Abstract class for read-only access to local or remote resources via 28 | a file-like interface. 29 | 30 | Parameters 31 | ---------- 32 | path: 33 | The local relative path to the resource. 34 | """ 35 | 36 | path: Path 37 | 38 | def open(self) -> io.RawIOBase: 39 | raise NotImplementedError 40 | 41 | 42 | class LocalResource(Resource): 43 | """Providing read-only access to local data via a file-like interface. 44 | 45 | Examples 46 | -------- 47 | >>> resource = LocalResource("README.md") 48 | """ 49 | 50 | def __init__(self, path: Union[str, os.PathLike]): 51 | self.path: Path = Path(path) 52 | 53 | def open(self) -> io.RawIOBase: 54 | return io.FileIO(self.path, mode="r") 55 | 56 | 57 | class RemoteResource(Resource): 58 | """Provides read-only access to remote data via a file-like interface. 59 | 60 | Parameters 61 | ---------- 62 | url: 63 | The URL where the resource. can be downladed from. 64 | headers: 65 | Optional headers to pass to the request. 
66 | 67 | Examples 68 | -------- 69 | >>> url = "https://raw.githubusercontent.com/sdsc-ordes/gimie/main/README.md" 70 | >>> content = RemoteResource("README.md", url).open().read() 71 | >>> assert isinstance(content, bytes) 72 | """ 73 | 74 | def __init__(self, path: str, url: str, headers: Optional[dict] = None): 75 | self.path = Path(path) 76 | self.url = url 77 | self.headers = headers or {} 78 | 79 | def open(self) -> io.RawIOBase: 80 | resp = requests.get( 81 | self.url, headers=self.headers, stream=True 82 | ).iter_content(chunk_size=128) 83 | return IterStream(resp) 84 | 85 | 86 | class IterStream(io.RawIOBase): 87 | """Wraps an iterator under a like a file-like interface. 88 | Empty elements in the iterator are ignored. 89 | 90 | Parameters 91 | ---------- 92 | iterator: 93 | An iterator yielding bytes. 94 | 95 | Examples 96 | -------- 97 | >>> stream = IterStream(iter([b"Hello ", b"", b"World"])) 98 | >>> stream.read() 99 | b'Hello World' 100 | """ 101 | 102 | def __init__(self, iterator: Iterator[bytes]): 103 | self.leftover = b"" 104 | self.iterator = iterator 105 | 106 | def readable(self): 107 | return True 108 | 109 | def readinto(self, b): 110 | try: 111 | l = len(b) # We're supposed to return at most this much 112 | while True: 113 | chunk = self.leftover or next(self.iterator) 114 | # skip empty elements 115 | if not chunk: 116 | continue 117 | output, self.leftover = chunk[:l], chunk[l:] 118 | b[: len(output)] = output 119 | return len(output) 120 | except StopIteration: 121 | return 0 # indicate EOF 122 | -------------------------------------------------------------------------------- /gimie/utils/uri.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Utility functions used throughout gimie.""" 18 | 19 | from typing import List, Literal 20 | from urllib.parse import urlparse 21 | import re 22 | 23 | from gimie.graph.namespaces import GIMIE 24 | 25 | 26 | def validate_url(url: str): 27 | """Checks if input is a valid URL. 28 | credits: https://stackoverflow.com/a/38020041 29 | 30 | Examples 31 | ------------- 32 | >>> validate_url('/data/my_repo') 33 | False 34 | >>> validate_url(532) 35 | False 36 | >>> validate_url('https://www.github.com/sdsc-ordes/gimie') 37 | True 38 | >>> validate_url('github.com/sdsc-ordes/gimie') 39 | False 40 | """ 41 | try: 42 | result = urlparse(url) 43 | return all([result.scheme, result.netloc]) 44 | except AttributeError: 45 | return False 46 | 47 | 48 | def generate_uri(ref: str): 49 | """Given a reference (e.g. commit sha), return a URI. 50 | 51 | Parameters 52 | ---------- 53 | path: 54 | Path to the repository, either local or a URL. 55 | 56 | 57 | Returns 58 | ------- 59 | fair_uri: 60 | A unique resource identifier (URI) for the repository path. 61 | 62 | Examples 63 | -------- 64 | >>> generate_uri("abc") 65 | 'https://sdsc-ordes.github.io/gimie/abc' 66 | """ 67 | return str(GIMIE[ref]) 68 | 69 | 70 | def is_valid_orcid(orcid): 71 | """Check if the input is a valid ORCID according to definition from orcid.org [1]_. 72 | .. 
[1] [https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier](https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier) 73 | 74 | Parameters 75 | ---------- 76 | orcid: 77 | The ORCID to validate. 78 | 79 | Returns 80 | ------- 81 | bool: 82 | True if the ORCID is valid, False otherwise. 83 | 84 | Examples 85 | -------- 86 | >>> is_valid_orcid("https://orcid.org/0000-0001-2345-6789") 87 | True 88 | >>> is_valid_orcid("0000-0001-2345-6789") 89 | False 90 | >>> is_valid_orcid("http://orcid.org/0000-0001-2345-6789") 91 | False 92 | 93 | """ 94 | return bool( 95 | re.match( 96 | r"(https:\/\/)?orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid) 97 | ) 98 | ) 99 | 100 | 101 | def extract_doi_match(doi): 102 | """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_. 103 | .. [1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions) 104 | 105 | Parameters 106 | ---------- 107 | doi: 108 | The DOI to validate. 109 | 110 | Returns 111 | ------- 112 | str: 113 | The extracted short DOI if it is valid, None otherwise. 
114 | 115 | Examples 116 | -------- 117 | >>> extract_doi_match("10.5281/zenodo.1234567") 118 | '10.5281/zenodo.1234567' 119 | >>> extract_doi_match("https://doi.org/10.5281/zenodo.1234567") 120 | '10.5281/zenodo.1234567' 121 | """ 122 | match = re.search( 123 | r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE 124 | ) 125 | if match: 126 | return match.group() 127 | -------------------------------------------------------------------------------- /docs/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 41 | 43 | 49 | 50 | 55 | 60 | 67 | 74 | 81 | 86 | 93 | gimie 103 | 104 | 105 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish on Github container registry 2 | 3 | on: 4 | release: 5 | type: [published] 6 | push: 7 | branches: [main] 8 | pull_request: 9 | paths: 10 | - 'pyproject.toml' 11 | - './docker/**' 12 | - '.github/workflows/**' 13 | 14 | env: 15 | REGISTRY: ghcr.io 16 | 17 | jobs: 18 | build-image: 19 | runs-on: ubuntu-latest 20 | if: github.ref != 'refs/heads/main' 21 | permissions: 22 | contents: read 23 | packages: write 24 | 25 | steps: 26 | # https://github.com/actions/checkout 27 | - name: checkout repository 28 | uses: actions/checkout@v4 29 | 30 | - name: lowercase image name 31 | run: | 32 | echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} 33 | 34 | # https://github.com/docker/setup-qemu-action 35 | - name: Set up QEMU 36 | uses: docker/setup-qemu-action@v3.0.0 37 | 38 | # https://github.com/docker/setup-buildx-action 39 | - name: Set up Docker Buildx 40 | id: buildx 41 | uses: docker/setup-buildx-action@v3.0.0 42 | 43 | - name: Get current release version 44 | id: release-version 45 | run: | 46 | version=$(grep -E '^version += +' pyproject.toml | sed -E 's/.*= +//' | sed "s/['\"]//g") 47 | echo 
"version=${version}" >> $GITHUB_OUTPUT 48 | echo "version_build=${version}_"$(git rev-parse --short "$GITHUB_SHA") >> $GITHUB_OUTPUT 49 | 50 | # https://github.com/docker/build-push-action 51 | - name: Build Docker image 52 | uses: docker/build-push-action@v5.0.0 53 | with: 54 | context: . 55 | platforms: linux/amd64,linux/arm64 56 | file: .docker/Dockerfile 57 | push: false 58 | tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.release-version.outputs.version_build }} 59 | build-args: VERSION_BUILD=${{ steps.release-version.outputs.version_build }} 60 | outputs: type=image,annotation-index.org.opencontainers.image.description=Extract linked metadata from repositories. 61 | 62 | push-image: 63 | runs-on: ubuntu-latest 64 | if: github.ref == 'refs/heads/main' 65 | permissions: 66 | contents: read 67 | packages: write 68 | 69 | steps: 70 | # https://github.com/actions/checkout 71 | - name: checkout repository 72 | uses: actions/checkout@v4 73 | 74 | - name: lowercase image name 75 | run: | 76 | echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} 77 | 78 | # https://github.com/docker/setup-qemu-action 79 | - name: Set up QEMU 80 | uses: docker/setup-qemu-action@v3.0.0 81 | 82 | # https://github.com/docker/setup-buildx-action 83 | - name: Set up Docker Buildx 84 | id: buildx 85 | uses: docker/setup-buildx-action@v3.0.0 86 | 87 | - name: Get current release version 88 | id: release-version 89 | run: | 90 | version=$(grep -E '^version += +' pyproject.toml | sed -E 's/.*= +//' | sed "s/['\"]//g") 91 | echo "version=${version}" >> $GITHUB_OUTPUT 92 | echo "version_build=${version}_"$(git rev-parse --short "$GITHUB_SHA") >> $GITHUB_OUTPUT 93 | 94 | # https://github.com/docker/login-action 95 | - name: Log in to the Container registry 96 | uses: docker/login-action@v3.0.0 97 | with: 98 | registry: ${{ env.REGISTRY }} 99 | username: ${{ github.actor }} 100 | password: ${{ secrets.GITHUB_TOKEN }} 101 | 102 | # https://github.com/docker/metadata-action 103 | - 
name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5.0.0
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          # FIX: these previously referenced needs.build-image.outputs.*,
          # but this job declares no `needs:` and build-image defines no
          # job-level `outputs:` (the two jobs' `if:` conditions are
          # mutually exclusive anyway), so the tags always expanded to
          # empty strings. Use this job's own release-version step instead.
          tags: |
            type=raw,value=latest,enable=${{ github.event_name == 'push' }}
            type=raw,value=${{ steps.release-version.outputs.version_build }},enable=${{ github.event_name == 'push' }}
            type=raw,value=${{ steps.release-version.outputs.version }},enable=${{ github.event_name == 'release' }}

      # https://github.com/docker/build-push-action
      - name: Push Docker image
        uses: docker/build-push-action@v5.0.0
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          file: .docker/Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: VERSION_BUILD=${{ steps.release-version.outputs.version_build }}
          outputs: type=image,annotation-index.org.opencontainers.image.description=Extract linked metadata from repositories.
--------------------------------------------------------------------------------
/gimie/cli.py:
--------------------------------------------------------------------------------
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Command line interface to the gimie package.""" 18 | from enum import Enum 19 | from typing import List, Optional 20 | 21 | import click 22 | import typer 23 | 24 | from gimie import __version__ 25 | from gimie.parsers import get_parser, list_default_parsers, list_parsers 26 | from gimie.project import Project 27 | 28 | app = typer.Typer(add_completion=False) 29 | 30 | 31 | # Used to autogenerate docs with sphinx-click 32 | @click.group() 33 | def cli(): 34 | """Command line group""" 35 | pass 36 | 37 | 38 | class RDFFormatChoice(str, Enum): 39 | ttl = "ttl" 40 | jsonld = "json-ld" 41 | nt = "nt" 42 | 43 | 44 | def version_callback(value: bool): 45 | if value: 46 | print(f"gimie {__version__}") 47 | # Exits successfully 48 | raise typer.Exit() 49 | 50 | 51 | @app.command() 52 | def data( 53 | url: str, 54 | format: RDFFormatChoice = typer.Option( 55 | RDFFormatChoice.ttl, 56 | "--format", 57 | show_choices=True, 58 | help="Output serialization format for the RDF graph.", 59 | ), 60 | base_url: Optional[str] = typer.Option( 61 | None, 62 | "--base-url", 63 | help="Specify the base URL of the git provider. Inferred by default.", 64 | ), 65 | include_parser: Optional[List[str]] = typer.Option( 66 | None, 67 | "--include-parser", 68 | "-I", 69 | help="Only include selected parser. Use 'gimie parsers' to list parsers.", 70 | ), 71 | exclude_parser: Optional[List[str]] = typer.Option( 72 | None, 73 | "--exclude-parser", 74 | "-X", 75 | help="Exclude selected parser.", 76 | ), 77 | version: Optional[bool] = typer.Option( 78 | None, 79 | "--version", 80 | help="Display version and exit", 81 | callback=version_callback, 82 | ), 83 | ): 84 | """Extract linked metadata from a Git repository at the target URL. 85 | 86 | The output is sent to stdout, and turtle is used as the default serialization format. 
87 | """ 88 | parser_names = list_default_parsers() 89 | if exclude_parser: 90 | parser_names -= set([parser for parser in exclude_parser]) 91 | if include_parser: 92 | parser_names = set([parser for parser in include_parser]) 93 | proj = Project(url, base_url=base_url, parser_names=parser_names) 94 | repo_meta = proj.extract() 95 | print(repo_meta.serialize(format=format.value)) 96 | 97 | 98 | @app.command() 99 | def advice(url: str): 100 | """Show a metadata completion report for a Git repository 101 | at the target URL. 102 | 103 | NOTE: Not implemented yet""" 104 | ... 105 | raise typer.Exit() 106 | 107 | 108 | @app.command() 109 | def parsers( 110 | verbose: bool = typer.Option( 111 | False, "--verbose", help="Show parser description." 112 | ) 113 | ): 114 | """List available parsers, specifying which are default. 115 | If --verbose is used, show parser description.""" 116 | message = "" 117 | parsers = list_parsers() 118 | default_parsers = list_default_parsers() 119 | 120 | for name in parsers: 121 | # Each parser gets their name in bold green 122 | title = typer.style(name, fg=typer.colors.GREEN, bold=True) 123 | default = " (default)" if name in default_parsers else "" 124 | description = f" - {get_parser(name).__doc__}" if verbose else "" 125 | 126 | parser_line = f"{title}{default}{description}" 127 | message += f"{parser_line}\n" 128 | 129 | typer.echo(message) 130 | 131 | 132 | typer_cli = typer.main.get_command(app) 133 | cli.add_command(typer_cli, "cli") 134 | 135 | 136 | # This callback is triggered when gimie is called without subcommand 137 | @app.callback() 138 | def callback( 139 | version: Optional[bool] = typer.Option( 140 | None, "--version", callback=version_callback 141 | ) 142 | ): 143 | """gimie digs Git repositories for metadata.""" 144 | 145 | 146 | if __name__ == "__main__": 147 | app() 148 | -------------------------------------------------------------------------------- /gimie/parsers/license/__init__.py: 
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from io import BytesIO
import pkgutil
import re
from typing import List, Optional, Set

import numpy as np
import scipy.sparse as sp
from rdflib.term import URIRef
from rdflib import Graph
from gimie.graph.namespaces import SDO
from gimie.parsers.abstract import Parser, Property
from gimie.utils.text_processing import TfidfVectorizer


class LicenseParser(Parser):
    """Parse a LICENSE body into a schema:license triple.
    Uses tf-idf-based matching against the SPDX license corpus."""

    def __init__(self, subject: str):
        super().__init__(subject)

    def parse(self, data: bytes) -> Graph:
        """Extracts an spdx URL from a license file and returns a
        graph with a single (subject, schema:license, spdx_url) triple.
        If no matching URL is found, an empty graph is returned.
        """
        license_facts = Graph()
        license_url = match_license(data)

        if license_url:
            license_facts.add((self.subject, SDO.license, URIRef(license_url)))
        return license_facts


def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
    """Given a license file, returns the url of the most similar spdx license.
    This is done using TF-IDF on the license text and getting the
    closest match in the SPDX license corpus based on cosine similarity.

    Parameters
    ----------
    data:
        The license body as bytes.
    min_similarity:
        Cosine-similarity threshold below which no match is reported.

    Returns
    -------
    str, optional
        URL of the closest spdx license, or None when the best match
        is below ``min_similarity``.

    Examples
    --------
    >>> match_license(open('LICENSE', 'rb').read())
    'https://spdx.org/licenses/Apache-2.0.html'
    """
    # Compute tfidf vector for input license
    vectorizer = load_tfidf_vectorizer()
    input_vec = vectorizer.transform([data.decode()])

    # Load ids and tfidf vectors for spdx licenses
    spdx_licenses = load_spdx_ids()
    spdx_vecs = load_tfidf_matrix()
    # Compute cosine similarity between input_vec and spdx vectors
    sim: np.ndarray = (input_vec * spdx_vecs.T).todense()
    # Pick the most similar spdx vector
    closest_idx = np.argmax(sim)
    # If similarity is below threshold, return None
    if sim[0, closest_idx] < min_similarity:
        return None
    closest_id = spdx_licenses[closest_idx]
    return f"https://spdx.org/licenses/{closest_id}.html"


def load_tfidf_vectorizer() -> TfidfVectorizer:
    """Load the pre-fitted tfidf vectorizer shipped with the package.

    Raises
    ------
    FileNotFoundError
        If the packaged vectorizer file is missing.
    """
    data = pkgutil.get_data(__name__, "data/tfidf_vectorizer.json")
    if data is None:
        raise FileNotFoundError("Could not find tfidf_vectorizer.json")
    return TfidfVectorizer.model_validate_json(data)


def load_spdx_ids() -> List[str]:
    """Load spdx license identifiers from disk.

    Returns
    -------
    list of str
        The first CSV column (the spdx id) of each non-empty row.

    Raises
    ------
    FileNotFoundError
        If the packaged csv file is missing.
    """
    data = pkgutil.get_data(__name__, "data/spdx_licenses.csv")
    if data is None:
        raise FileNotFoundError("Could not find spdx_licenses.csv")
    # splitlines() is robust to a trailing newline and to \r\n line
    # endings, which split("\n") would leave as empty / '\r'-polluted rows.
    reader = csv.reader(data.decode().splitlines())
    return [row[0] for row in reader if row]
def load_tfidf_matrix() -> sp.csr_matrix:
    """Load pre-computed tfidf matrix of spdx licenses from disk.
    Matrix has dimensions (n_licenses, n_features)."""
    raw = pkgutil.get_data(__name__, "data/tfidf_matrix.npz")
    if raw is None:
        raise FileNotFoundError("Could not find tfidf_matrix.npz")
    return sp.load_npz(BytesIO(raw))


def is_license_filename(filename: str) -> bool:
    """Given an input filename, returns a boolean indicating whether the filename path looks like a license.

    Parameters
    ----------
    filename:
        A filename to check.

    Examples
    --------
    >>> is_license_filename('LICENSE-APACHE')
    True
    >>> is_license_filename('README.md')
    False
    """
    # Hidden files (e.g. dotfiles) are never treated as licenses.
    if filename.startswith("."):
        return False
    pattern = r".*(license(s)?.*|lizenz|reus(e|ing).*|copy(ing)?.*)(\.(txt|md|rst))?$"
    # re.match anchors at the start; case-insensitive so LICENSE/license both hit.
    return bool(re.match(pattern, filename, flags=re.IGNORECASE))
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extractor which uses a locally available (usually cloned) repository."""
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
import os
import shutil
import tempfile
from typing import List, Optional
import uuid

import git
import pydriller

from gimie.io import LocalResource
from gimie.models import Person, Repository
from gimie.extractors.abstract import Extractor
from pathlib import Path


@dataclass
class GitExtractor(Extractor):
    """
    This class is responsible for extracting metadata from a git repository.

    Parameters
    ----------
    url: str
        The url of the git repository.
    base_url: Optional[str]
        The base url of the git remote.
    local_path: Optional[str]
        The local path where the cloned git repository is located.

    Attributes
    ----------
    uri: Optional[str]
        The URI to assign the repository in RDF.
    repository: Repository
        The repository we are extracting metadata from.
    """

    url: str
    base_url: Optional[str] = None
    local_path: Optional[str] = None
    # Set to True when we cloned into a temp dir ourselves (see __del__).
    _cloned: bool = False

    def extract(self) -> Repository:
        """Extract creator, contributors and timestamps into a Repository."""
        # Assuming author is the first person to commit
        self.repository = self._repo_data

        repo_meta = dict(
            authors=[self._get_creator()],
            contributors=self._get_contributors(),
            date_created=self._get_creation_date(),
            date_modified=self._get_modification_date(),
            # NOTE(review): ``path`` is presumably provided by the Extractor
            # base class -- confirm, it is not defined in this file.
            name=self.path,
            url=self.url,
        )

        return Repository(**repo_meta)  # type: ignore

    def list_files(self) -> List[LocalResource]:
        """List all files in the work tree, excluding the .git directory."""
        self.repository = self._repo_data
        file_list = []

        for path in Path(self.local_path).rglob("*"):  # type: ignore
            if (path.parts[0] == ".git") or not path.is_file():
                continue
            file_list.append(LocalResource(path))

        return file_list

    def __del__(self):
        """Cleanup the cloned repo if it was cloned and is located in tempdir."""
        try:
            # Can't be too careful with temp files
            tempdir = tempfile.gettempdir()
            if (
                self.local_path
                and self._cloned
                and self.local_path.startswith(tempdir)
                and tempdir != os.getcwd()
            ):
                shutil.rmtree(self.local_path)
        except AttributeError:
            pass

    @cached_property
    def _repo_data(self) -> pydriller.Repository:
        """Get the repository data by accessing local data or cloning.

        When no local path is given, the repository is cloned into a
        temporary directory which __del__ removes again.
        """
        if self.local_path is None:
            self._cloned = True
            # mkdtemp() creates a directory that persists until we delete it.
            # The previous ``TemporaryDirectory().name`` discarded the object,
            # so its finalizer removed the directory immediately and only the
            # clone recreating the path made it appear to work.
            self.local_path = tempfile.mkdtemp()
            git.Repo.clone_from(self.url, self.local_path)  # type: ignore
        return pydriller.Repository(self.local_path)

    def _get_contributors(self) -> List[Person]:
        """Get the deduplicated (name, email) authors of all commits."""
        authors = set()
        for commit in self.repository.traverse_commits():
            if commit.author is not None:
                authors.add((commit.author.name, commit.author.email))
        return [self._dev_to_person(name, email) for name, email in authors]

    def _get_creation_date(self) -> Optional[datetime]:
        """Get the creation date of the repository (first commit date)."""
        try:
            return next(self.repository.traverse_commits()).author_date
        except StopIteration:
            return None

    def _get_modification_date(self) -> Optional[datetime]:
        """Get the last modification date of the repository (last commit date)."""
        # Walk all commits and keep the last one. The previous version
        # returned from a ``finally`` block, which silently swallows any
        # exception raised while traversing.
        last_commit = None
        for last_commit in self.repository.traverse_commits():
            pass
        return last_commit.author_date if last_commit else None

    def _get_creator(self) -> Optional[Person]:
        """Get the creator of the repository (author of the first commit)."""
        try:
            creator = next(self.repository.traverse_commits()).author
            return self._dev_to_person(creator.name, creator.email)
        except StopIteration:
            return None

    def _dev_to_person(
        self, name: Optional[str], email: Optional[str]
    ) -> Person:
        """Convert a Developer (name, email) pair to a Person object."""
        if name is None:
            # No name available: fall back to a random unique id.
            uid = str(uuid.uuid4())
        else:
            uid = name.replace(" ", "_").lower()
        dev_id = f"{self.url}/{uid}"
        return Person(
            _id=dev_id,
            identifier=uid,
            name=name,
            email=email,
        )
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Data models to represent nodes in the graph generated by gimie.""" 18 | from __future__ import annotations 19 | from dataclasses import dataclass, field 20 | from datetime import datetime 21 | import datetime 22 | from typing import List, Optional, Union 23 | 24 | from calamus.schema import JsonLDSchema 25 | from calamus import fields 26 | from rdflib import Graph 27 | 28 | from gimie.graph.namespaces import SDO 29 | 30 | 31 | @dataclass(order=True) 32 | class Release: 33 | """ 34 | This class represents a release of a repository. 35 | 36 | Parameters 37 | ---------- 38 | tag: str 39 | The tag of the release. 40 | date: datetime.datetime 41 | The date of the release. 42 | commit_hash: str 43 | The commit hash of the release. 
44 | """ 45 | 46 | tag: str = field(compare=False) 47 | date: datetime = field(compare=True) 48 | commit_hash: str = field(compare=False) 49 | 50 | 51 | @dataclass 52 | class Organization: 53 | """See http//schema.org/Organization""" 54 | 55 | _id: str 56 | name: str 57 | legal_name: Optional[str] = None 58 | email: Optional[List[str]] = None 59 | description: Optional[str] = None 60 | logo: Optional[str] = None 61 | 62 | 63 | class OrganizationSchema(JsonLDSchema): 64 | _id = fields.Id() 65 | name = fields.String(SDO.name) 66 | legal_name = fields.String(SDO.legalName) 67 | email = fields.String(SDO.email) 68 | description = fields.String(SDO.description) 69 | logo = fields.IRI(SDO.logo) 70 | 71 | class Meta: 72 | rdf_type = SDO.Organization 73 | model = Organization 74 | 75 | 76 | @dataclass 77 | class Person: 78 | """See http//schema.org/Person""" 79 | 80 | _id: str 81 | identifier: str 82 | name: Optional[str] = None 83 | email: Optional[str] = None 84 | affiliations: Optional[List[Organization]] = None 85 | 86 | def __str__(self): 87 | name = f"({self.name}) " if self.name else "" 88 | email = f"<{self.email}> " if self.email else "" 89 | orgs = ( 90 | f"[{', '.join([org.name for org in self.affiliations])}]" 91 | if self.affiliations 92 | else "" 93 | ) 94 | return f"{self.identifier} {name}{email}{orgs}".strip(" ") 95 | 96 | 97 | class PersonSchema(JsonLDSchema): 98 | _id = fields.Id() 99 | identifier = fields.String(SDO.identifier) 100 | name = fields.String(SDO.name) 101 | affiliations = fields.Nested( 102 | SDO.affiliation, OrganizationSchema, many=True 103 | ) 104 | 105 | class Meta: 106 | rdf_type = SDO.Person 107 | model = Person 108 | 109 | 110 | @dataclass 111 | class Repository: 112 | """This class represents a git repository. 113 | It does not contain any information about the content of the repository. 
114 | See https://schema.org/SoftwareSourceCode 115 | """ 116 | 117 | url: str 118 | name: str 119 | 120 | authors: Optional[List[Union[Organization, Person]]] = None 121 | contributors: Optional[List[Person]] = None 122 | date_created: Optional[datetime] = None 123 | date_modified: Optional[datetime] = None 124 | date_published: Optional[datetime] = None 125 | description: Optional[str] = None 126 | download_url: Optional[str] = None 127 | identifier: Optional[str] = None 128 | keywords: Optional[List[str]] = None 129 | licenses: Optional[List[str]] = None 130 | parent_repository: Optional[str] = None 131 | prog_langs: Optional[List[str]] = None 132 | version: Optional[str] = None 133 | 134 | @property 135 | def _id(self) -> str: 136 | """Unique identifier for the repository.""" 137 | return self.url 138 | 139 | def to_graph(self) -> Graph: 140 | """Convert repository to RDF graph.""" 141 | jd = RepositorySchema().dumps(self) 142 | g: Graph = Graph().parse(format="json-ld", data=str(jd)) 143 | g.bind("schema", SDO) 144 | return g 145 | 146 | def serialize(self, format: str = "ttl", **kwargs) -> str: 147 | """Serialize the RDF graph representing the instance.""" 148 | return self.to_graph().serialize(format=format, **kwargs) # type: ignore 149 | 150 | def jsonld(self) -> str: 151 | """Alias for jsonld serialization.""" 152 | return self.serialize(format="json-ld") 153 | 154 | 155 | class RepositorySchema(JsonLDSchema): 156 | """This defines the schema used for json-ld serialization.""" 157 | 158 | _id = fields.Id() 159 | authors = fields.Nested( 160 | SDO.author, [PersonSchema, OrganizationSchema], many=True 161 | ) 162 | contributors = fields.Nested(SDO.contributor, PersonSchema, many=True) 163 | date_created = fields.Date(SDO.dateCreated) 164 | date_modified = fields.Date(SDO.dateModified) 165 | date_published = fields.Date(SDO.datePublished) 166 | description = fields.String(SDO.description) 167 | download_url = fields.IRI(SDO.downloadUrl) 168 | identifier = 
fields.String(SDO.identifier) 169 | keywords = fields.List(SDO.keywords, fields.String) 170 | licenses = fields.List(SDO.license, fields.IRI) 171 | name = fields.String(SDO.name) 172 | parent_repository = fields.IRI(SDO.isBasedOn) 173 | prog_langs = fields.List(SDO.programmingLanguage, fields.String) 174 | url = fields.IRI(SDO.codeRepository) 175 | version = fields.String(SDO.version) 176 | 177 | class Meta: 178 | rdf_type = SDO.SoftwareSourceCode 179 | model = Repository 180 | add_value_types = False 181 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Notable changes introduced in gimie releases are documented in this file 2 | 3 | 4 | ## [0.7.2] - 2024-12-18 5 | 6 | ### Bug Fixes 7 | 8 | - *(cff)* doi structure parsing (#121) 9 | 10 | 11 | ## [0.7.1] - 2024-12-09 12 | 13 | ### Bug Fixes 14 | 15 | - *(dependency missing)* Added pyyaml (#119) 16 | 17 | 18 | ## [0.7.0] - 2024-11-28 19 | 20 | ### Bug Fixes 21 | 22 | - *(cff)* enforce valid urls as doi (#108)- spelling mistake in run as library docs (#113) 23 | 24 | ### Documentation 25 | - update gimie API examples (#105) 26 | - add CFF file (#111) 27 | 28 | ### Features 29 | 30 | - *(parser)* extract authors from CFF files (#115)- add parsers support (#97) 31 | - cff to doi parser (#107) 32 | 33 | 34 | ## [0.6.0] - 2023-10-19 35 | 36 | ### Bug Fixes 37 | 38 | - *(deps)* switch to scancode mini (#88) 39 | - *(docker)* push action was missing buildx (#91) 40 | - *(github)* replace superseded schema:isBasedOnUrl property (#80)- incorrect mapping for schema:codeRepository (#64) 41 | - *(license)* NOASSERTION should not return triples. 
(#66) 42 | 43 | ### Features 44 | 45 | - *(conventional-PRs)* all PRs will need to follow conventional format 46 | - *(conventional-PRs)* all PRs will need to follow conventional format 47 | - *(github.py)* Get "forked from" property of a repository (#79) 48 | - *(io)* file-like interface to remote resources (#70)- license matcher for git extractor (#78) 49 | 50 | 51 | ## [0.5.1] - 2023-07-10 52 | 53 | ### Bug Fixes 54 | 55 | - incorrect mapping for schema:codeRepository (#64) 56 | 57 | 58 | ## [0.5.0] - 2023-07-04 59 | 60 | ### Bug Fixes 61 | 62 | - *(gitlab)* extraction of author on user-owned projects (#57) 63 | 64 | ### Documentation 65 | 66 | - add docs website (#58) 67 | 68 | ### Features 69 | 70 | - *(gitlab)* support private instances (#62) 71 | 72 | 73 | ## [0.4.0] - 2023-06-09 74 | 75 | ### Bug Fixes 76 | 77 | - *(docs)* execute Makefile rule with poetry 78 | - *(gitlab)* edge case where no release available 79 | - *(gitlab)* pass user node to _get_author instead of parent node 80 | - *(gitlab)* rm debug breakpoint 81 | - *(gitlab)* extraction of author on user-owned projects (#57)- gitlab download url 82 | - prevent license finder from picking up docs files 83 | 84 | ### Documentation 85 | 86 | - *(api)* reduce autodoc ToC depth 87 | - *(cli)* add and configure sphinx-click to work with typer 88 | - *(deps)* introduce doc dependency group 89 | - *(git)* rm duplicate attibute from docstring 90 | - *(setup)* add sphinx configuration 91 | - *(style)* add logo + favicon 92 | - *(style)* add logo to front page 93 | - *(theme)* furo -> sphinxawesome 94 | - *(theme)* add sphinx_design extension, downgrade to sphinx6 for compat 95 | - *(tokens)* Add tutorial for encrypted tokens 96 | - *(tokens)* fix windows instructions- add Makefile rule to generate sphinx website 97 | - initial sphinx website with apidoc 98 | - add apidoc output to gitignore 99 | - add intro pages 100 | - improve header names 101 | - add quickstart section, enable tabbing and crossref 102 | - 
add sphinx-tabs as doc dep 103 | - add sphinx-copybutton extension 104 | - add changelog and configure git-cliff 105 | - replace deprecated commonmark parser with myst 106 | - enable placeholder highlighting extension 107 | - improve index format 108 | - add windows variant for env var 109 | - add docs website (#58) 110 | - update readme and add docs badge 111 | 112 | ### Features 113 | 114 | - *(gitlab)* fallback to rest api if author missing from graphql. make type hints py38 compat. 115 | - *(io)* Allow rdflib kwargs in serialize()- use GraphQL API in gh extractor (#33) 116 | - Git extractor (#42) 117 | - disallow local paths (#46) 118 | 119 | 120 | ## [0.3.0] - 2023-02-24 121 | 122 | ### Bug Fixes 123 | 124 | - exclude hidden files from license search 125 | - correctly handle one or multiple license paths 126 | - temporarily disable scancode (#19) 127 | - rename GITHUB_TOKEN to ACCESS_TOKEN 128 | - change token back to ACCESS_TOKEN since GITHUB_TOKEN failed 129 | - GITHUB_TOKEN must be prefixed with github as environment variable 130 | - set test workflow back to using ACCESS_TOKEN as a repo secret 131 | - add .dockerignore, copy necessary files only and improve comments 132 | - rename container-publish.yml into docker-publish.yml 133 | - 'building docker image' instead of 'building docker container' 134 | 135 | ### Documentation 136 | 137 | - define initial contributing guidelines 138 | - add usage examples in README 139 | - update copyright notice in license 140 | - specify type hints and rm unused imports in LicenseMetadata 141 | - add dev status in readme 142 | - document the release process in the readme 143 | - readme badges (#25) 144 | - add section to the readme on how to provide a github token 145 | - adapt documentation to usage of ACCESS_TOKEN instead of GITHUB_TOKEN 146 | - adapt readme to installation with makefile 147 | - give options to install either PyPI or dev version of gimie 148 | - add message for docker-build Makefile rule 149 | - add 
image annotations to dockerfile 150 | - add docker instructions in readme 151 | 152 | ### Features 153 | 154 | - *(cli)* add CLI skeleton (#9)- initial project definition with pyproject.toml 155 | - add placeholder folders 156 | - add placeholder tests 157 | - add basic repo class and placeholder source interfaces 158 | - add console entrypoint definition in pyproject.toml 159 | - add GitMetadata methods to get commit authors and repository creation date 160 | - add method to get releases date and commit hash 161 | - sort releases by date 162 | - add method to get git repo creator 163 | - add unit tests for git source 164 | - Created a license finder using scancode toolkit 165 | - Added triple serialization of license result (spdx url) 166 | - use cached property from functools 167 | - added a make_graph script. Now only contains add_license_to_graph(). 168 | - Created software class, and make graph functions, black reformat 169 | - add license scanner (#12) 170 | - add prototype for RDF graph serialization (#15) 171 | - initial architecture with GithubExtractor (#23) 172 | - add python-dotenv to dependecies 173 | - pick up github token from the environment variables 174 | - add `.env.dist` file as an example for a `.env` file 175 | - provide option to provide github_token when calling extractor 176 | - add pre-commit to dependencies 177 | - add makefile to make installation easier 178 | - add Dockerfile and entrypoint.sh 179 | - add Makefile rule to build the docker image 180 | - add github workflow to push image to github container registry 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![gimie](docs/logo.svg)](https://github.com/sdsc-ordes/gimie) 2 | 3 | [![PyPI version](https://badge.fury.io/py/gimie.svg)](https://badge.fury.io/py/gimie) [![Python Poetry 
Test](https://github.com/sdsc-ordes/gimie/actions/workflows/poetry-pytest.yml/badge.svg)](https://github.com/sdsc-ordes/gimie/actions/workflows/poetry-pytest.yml) [![docs](https://github.com/sdsc-ordes/gimie/actions/workflows/sphinx-docs.yml/badge.svg)](https://sdsc-ordes.github.io/gimie) [![Coverage Status](https://coveralls.io/repos/github/sdsc-ordes/gimie/badge.svg?branch=main)](https://coveralls.io/github/sdsc-ordes/gimie?branch=main) 4 | 5 | Gimie (GIt Meta Information Extractor) is a python library and command line tool to extract structured metadata from git repositories. 6 | 7 | 8 | ## Context 9 | Scientific code repositories contain valuable metadata which can be used to enrich existing catalogues, platforms or databases. This tool aims to easily extract structured metadata from a generic git repositories. It can extract extract metadata from the Git provider (GitHub or GitLab) or from the git index itself. 10 | 11 | ---------------------------------------------------------------------- 12 | 13 | Using Gimie: easy peasy, it's a 3 step process. 14 | 15 | ## 1: Installation 16 | 17 | To install the stable version on PyPI: 18 | 19 | ```shell 20 | pip install gimie 21 | ``` 22 | 23 | To install the dev version from github: 24 | 25 | ```shell 26 | pip install git+https://github.com/sdsc-ordes/gimie.git@main#egg=gimie 27 | ``` 28 | 29 | Gimie is also available as a docker container hosted on the [Github container registry](https://github.com/sdsc-ordes/gimie/pkgs/container/gimie): 30 | 31 | ```shell 32 | docker pull ghcr.io/sdsc-ordes/gimie:latest 33 | 34 | # The access token can be provided as an environment variable 35 | docker run -e GITHUB_TOKEN=$GITHUB_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data 36 | ``` 37 | 38 | ## 2 : Set your credentials 39 | 40 | In order to access the github api, you need to provide a github token with the `read:org` scope. 41 | 42 | ### A. Create access tokens 43 | 44 | New to access tokens? 
Or don't know how to get your Github / Gitlab token ? 45 | 46 | Have no fear, see 47 | [here for Github tokens](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) and [here for Gitlab tokens](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html). 48 | (Note: tokens are as precious as passwords! Treat them as such.) 49 | 50 | ### B. Set your access tokens via the Terminal 51 | 52 | Gimie will use your access tokens to gather information for you. If you want info about a Github repo, Gimie needs your Github token; if you want info about a Gitlab Project then Gimie needs your Gitlab token. 53 | 54 | Add your tokens one by one in your terminal: 55 | your Github token: 56 | ```bash 57 | export GITHUB_TOKEN= 58 | ``` 59 | and/or your Gitlab token: 60 | ```bash 61 | export GITLAB_TOKEN= 62 | ``` 63 | 64 | ## 3: GIMIE info ! Run Gimie 65 | 66 | ### As a command line tool 67 | 68 | ```shell 69 | gimie data https://github.com/numpy/numpy 70 | ``` 71 | (want a Gitlab project instead? Just replace the URL in the command line) 72 | 73 | ### As a python library 74 | 75 | ```python 76 | from gimie.project import Project 77 | proj = Project("https://github.com/numpy/numpy") 78 | 79 | # To retrieve the rdflib.Graph object 80 | g = proj.extract() 81 | 82 | # To retrieve the serialized graph 83 | g_in_ttl = g.serialize(format='ttl') 84 | print(g_in_ttl) 85 | ``` 86 | For more advanced use see [the documentation](https://sdsc-ordes.github.io/gimie/intro/usage_python.html). 87 | ## Outputs 88 | 89 | The default output is [Turtle](https://www.w3.org/TR/turtle/), a textual syntax for [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) data model. We follow the schema recommended by [codemeta](https://codemeta.github.io/). 90 | Supported formats are turtle, json-ld and n-triples (by specifying the `--format` argument in your call i.e. 
`gimie data https://github.com/numpy/numpy --format 'ttl'`). 91 | 92 | With no specifications, Gimie will print results in the terminal. Want to save Gimie output to a file? Add your file path to the end : `gimie data https://github.com/numpy/numpy > path_to_output/gimie_output.ttl` 93 | 94 | ---------------------------------------------------------------------- 95 | 96 | ## Contributing 97 | 98 | All contributions are welcome. New functions and classes should have associated tests and docstrings following the [numpy style guide](https://numpydoc.readthedocs.io/en/latest/format.html). 99 | 100 | The code formatting standard we use is [black](https://github.com/psf/black), with `--line-length=79` to follow [PEP8](https://peps.python.org/pep-0008/) recommendations. We use [pytest](https://docs.pytest.org/en/7.2.x/) as our testing framework. This project uses [pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) to define package information, requirements and tooling configuration. 
101 | 102 | ### For development: 103 | 104 | activate a conda or virtual environment with Python 3.8 or higher 105 | 106 | ```shell 107 | git clone https://github.com/sdsc-ordes/gimie && cd gimie 108 | make install 109 | ``` 110 | 111 | run tests: 112 | 113 | ```shell 114 | make test 115 | ``` 116 | 117 | run checks: 118 | 119 | ```shell 120 | make check 121 | ``` 122 | for an easier use Github/Gitlab APIs, place your access tokens in the `.env` file: (and don't worry, the `.gitignore` will ignore them when you push to GitHub) 123 | 124 | ``` 125 | cp .env.dist .env 126 | ``` 127 | 128 | build documentation: 129 | 130 | ```shell 131 | make doc 132 | ``` 133 | 134 | ## Releases and Publishing on Pypi 135 | 136 | Releases are done via github release 137 | 138 | - a release will trigger a github workflow to publish the package on Pypi 139 | - Make sure to update to a new version in `pyproject.toml` and `conf.py` before making the release 140 | - It is possible to test the publishing on Pypi.test by running a manual workflow: go to github actions and run the Workflow: 'Publish on Pypi Test' 141 | 142 | ## Copyright 143 | Copyright © 2024-2025 Swiss Data Science Center (SDSC),[www.datascience.ch](http://www.datascience.ch/), ROR: [ror.org/02hdt9m26](https://ror.org/02hdt9m26). All rights reserved. The SDSC is a Swiss National Research Infrastructure, jointly established and legally represented by the École Polytechnique Fédérale de Lausanne (EPFL) and the Eidgenössische Technische Hochschule Zürich (ETH Zürich) as a société simple. This copyright encompasses all materials, software, documentation, and other content created and developed by the SDSC. 
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from io import BytesIO
import re
from typing import List, Optional, Set
import yaml
from rdflib.term import URIRef
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF
from gimie import logger
from gimie.graph.namespaces import SDO, MD4I
from gimie.parsers.abstract import Parser
from gimie.utils.uri import is_valid_orcid, extract_doi_match


class CffParser(Parser):
    """Parse DOI and authors from CITATION.cff."""

    def __init__(self, subject: str):
        super().__init__(subject)

    def parse(self, data: bytes) -> Graph:
        """Extracts DOIs and list of authors from a CFF file and returns a
        graph with schema:citation triples for each DOI, plus one node per
        author with name, orcid and affiliation values.
        If no DOIs are found, they will not be included in the graph.
        If no authors are found, they will not be included in the graph.
        If neither authors nor DOIs are found, an empty graph is returned.
        """
        extracted_cff_triples = Graph()
        dois = get_cff_doi(data)
        authors = get_cff_authors(data)

        if dois:
            for doi in dois:
                extracted_cff_triples.add(
                    (self.subject, SDO.citation, URIRef(doi))
                )
        if not authors:
            return extracted_cff_triples
        for author in authors:
            # All of orcid / given-names / family-names / affiliation are
            # optional in the CFF schema; .get() avoids KeyError on authors
            # that omit them (previously a hard crash).
            orcid_value = author.get("orcid")
            if not orcid_value or not is_valid_orcid(orcid_value):
                continue
            orcid = URIRef(orcid_value)
            extracted_cff_triples.add((self.subject, SDO.author, orcid))
            full_name = " ".join(
                part
                for part in (
                    author.get("given-names"),
                    author.get("family-names"),
                )
                if part
            )
            if full_name:
                extracted_cff_triples.add(
                    (orcid, SDO.name, Literal(full_name))
                )
            extracted_cff_triples.add(
                (orcid, MD4I.orcidId, Literal(orcid))
            )
            affiliation = author.get("affiliation")
            if affiliation:
                extracted_cff_triples.add(
                    (orcid, SDO.affiliation, Literal(affiliation))
                )
            extracted_cff_triples.add((orcid, RDF.type, SDO.Person))
        return extracted_cff_triples


def doi_to_url(doi: str) -> str:
    """Formats a doi to an https URL to doi.org.

    Parameters
    ----------
    doi
        doi where the scheme (e.g. https://) and
        hostname (e.g. doi.org) may be missing.

    Returns
    -------
    str
        doi formatted as a valid url. Base url
        is set to https://doi.org when missing.

    Raises
    ------
    ValueError
        If the input does not contain a valid DOI.

    Examples
    --------
    >>> doi_to_url("10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("https://doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    """

    doi_match = extract_doi_match(doi)

    if doi_match is None:
        raise ValueError(f"Not a valid DOI: {doi}")

    return f"https://doi.org/{doi_match}"
104 | 105 | Examples 106 | -------- 107 | >>> doi_to_url("10.0000/example.abcd") 108 | 'https://doi.org/10.0000/example.abcd' 109 | >>> doi_to_url("doi.org/10.0000/example.abcd") 110 | 'https://doi.org/10.0000/example.abcd' 111 | >>> doi_to_url("https://doi.org/10.0000/example.abcd") 112 | 'https://doi.org/10.0000/example.abcd' 113 | """ 114 | 115 | doi_match = extract_doi_match(doi) 116 | 117 | if doi_match is None: 118 | raise ValueError(f"Not a valid DOI: {doi}") 119 | 120 | return f"https://doi.org/{doi_match}" 121 | 122 | 123 | def get_cff_doi(data: bytes) -> Optional[list[str]]: 124 | """Given a CFF file, returns a list of DOIs, if any. 125 | 126 | Parameters 127 | ---------- 128 | data 129 | The cff file body as bytes. 130 | 131 | Returns 132 | ------- 133 | list of str, optional 134 | DOIs formatted as valid URLs 135 | 136 | Examples 137 | -------- 138 | >>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.1234\\n - type: doi\\n value: 10.5281/zenodo.5678", encoding="utf8")) 139 | ['https://doi.org/10.5281/zenodo.1234', 'https://doi.org/10.5281/zenodo.5678'] 140 | >>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.9012", encoding="utf8")) 141 | ['https://doi.org/10.5281/zenodo.9012'] 142 | >>> get_cff_doi(bytes("abc: def", encoding="utf8")) 143 | """ 144 | 145 | try: 146 | cff = yaml.safe_load(data.decode()) 147 | except yaml.scanner.ScannerError: 148 | logger.warning("cannot read CITATION.cff, skipped.") 149 | return None 150 | 151 | doi_urls = [] 152 | 153 | try: 154 | identifiers = cff["identifiers"] 155 | except (KeyError, TypeError): 156 | logger.warning( 157 | "CITATION.cff does not contain a valid 'identifiers' key." 
158 | ) 159 | return None 160 | 161 | for identifier in identifiers: 162 | if identifier.get("type") == "doi": 163 | try: 164 | doi_url = doi_to_url(identifier["value"]) 165 | doi_urls.append(doi_url) 166 | except ValueError as err: 167 | logger.warning(err) 168 | 169 | return doi_urls or None 170 | 171 | 172 | def get_cff_authors(data: bytes) -> Optional[List[dict[str, str]]]: 173 | """Given a CFF file, returns a list of dictionaries containing orcid, affiliation, first and last names of authors, if any. 174 | 175 | Parameters 176 | ---------- 177 | data 178 | The cff file body as bytes. 179 | 180 | Returns 181 | ------- 182 | list(dict), optional 183 | orcid, names strings of authors 184 | 185 | """ 186 | 187 | try: 188 | cff = yaml.safe_load(data.decode()) 189 | except yaml.scanner.ScannerError: 190 | logger.warning("cannot read CITATION.cff, skipped.") 191 | return None 192 | 193 | authors = [] 194 | try: 195 | for author in cff["authors"]: 196 | author_dict = { 197 | "family-names": author.get("family-names", ""), 198 | "given-names": author.get("given-names", ""), 199 | "orcid": author.get("orcid", ""), 200 | "affiliation": author.get("affiliation", ""), 201 | } 202 | authors.append(author_dict) 203 | except KeyError: 204 | logger.warning("CITATION.cff does not contain an 'authors' key.") 205 | return None 206 | 207 | return authors if authors else None 208 | -------------------------------------------------------------------------------- /gimie/utils/text_processing.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from functools import reduce 3 | import re 4 | from typing import ( 5 | Dict, 6 | Iterable, 7 | List, 8 | Literal, 9 | Optional, 10 | Tuple, 11 | ) 12 | 13 | import numpy as np 14 | from pydantic import BaseModel, Field 15 | from pydantic.dataclasses import dataclass 16 | import scipy.sparse as sp 17 | 18 | 19 | def tokenize(text: str, sep: str = " ") -> List[str]: 20 | 
"""Basic tokenizer. Removes punctuation, but not stop words. 21 | 22 | Parameters 23 | ---------- 24 | text: 25 | Text to tokenize. 26 | sep: 27 | Token separator. 28 | 29 | Examples 30 | -------- 31 | >>> tokenize("Is this a test? Yes it is.") 32 | ['is', 'this', 'a', 'test', 'yes', 'it', 'is'] 33 | """ 34 | text = text.lower() 35 | text = re.sub(r"[\.|,|;|:|!|?|\n]", "", text) 36 | return text.split(sep) 37 | 38 | 39 | def extract_ngrams(tokens: List[str], size: int = 1) -> List[str]: 40 | """Extract ngrams from a list of tokens. 41 | 42 | Parameters 43 | ---------- 44 | tokens: 45 | List of tokens. 46 | size: 47 | Size of ngrams to extract. 48 | 49 | Examples 50 | -------- 51 | >>> extract_ngrams(["this", "is", "a", "test"], size=2) 52 | ['this is', 'is a', 'a test'] 53 | """ 54 | return [ 55 | " ".join(tokens[i : i + size]) 56 | for i in range(0, len(tokens) - size + 1) 57 | ] 58 | 59 | 60 | def get_ngram_counts( 61 | doc: str, ngram_range: Tuple[int, int] = (1, 1) 62 | ) -> Counter[str]: 63 | """Get ngram counts for a document. The ngram range is inclusive. 64 | 65 | Parameters 66 | ---------- 67 | doc: 68 | Document to extract ngrams from. 69 | ngram_range: 70 | Inclusive range of ngram sizes to extract. 71 | 72 | Examples 73 | -------- 74 | >>> get_ngram_counts("Red roses red.", ngram_range=(1, 2)) 75 | Counter({'red': 2, 'roses': 1, 'red roses': 1, 'roses red': 1}) 76 | """ 77 | ngram_counts: Counter[str] = Counter() 78 | tokens = tokenize(doc) 79 | for size in range(ngram_range[0], ngram_range[1] + 1): 80 | ngram_counts += Counter(extract_ngrams(tokens, size)) 81 | return ngram_counts 82 | 83 | 84 | def normalize_csr_rows(X: sp.csr_matrix, norm: str = "l1") -> sp.csr_matrix: 85 | """Normalize rows of a CSR matrix in place. 86 | 87 | Parameters 88 | ---------- 89 | X: 90 | CSR matrix to normalize. 91 | norm: 92 | Norm to use for normalization. Either "l1" or "l2". 
93 | 94 | Examples 95 | -------- 96 | >>> X = sp.csr_matrix([[1, 2], [3, 4]], dtype=np.float64) 97 | >>> normalize_csr_rows(X, norm="l1").toarray() 98 | array([[0.33333333, 0.66666667], 99 | [0.42857143, 0.57142857]]) 100 | >>> normalize_csr_rows(X, norm="l2").toarray() 101 | array([[0.4472136 , 0.89442719], 102 | [0.6 , 0.8 ]]) 103 | """ 104 | norm_func = { 105 | "l1": lambda x: np.abs(x).sum(), 106 | "l2": lambda x: np.sqrt((x**2).sum()), 107 | }[norm] 108 | 109 | for i in range(X.shape[0]): 110 | if X[i].sum() == 0.0: 111 | continue 112 | 113 | X[i, :] /= norm_func(X[i].data) 114 | return X 115 | 116 | 117 | @dataclass 118 | class TfidfConfig: 119 | """Configuration for TfidfVectorizer. 120 | 121 | For more information on tf-idf, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html 122 | 123 | Parameters 124 | ---------- 125 | max_features: 126 | Maximum number of features to keep. If None, all features are kept. 127 | ngram_range: 128 | Inclusive range of ngram sizes to extract. 129 | smooth_idf: 130 | Smooth idf weights by adding a constant 1 to the numerator and denominator 131 | of the idf as if an extra document was seen containing every term once, 132 | preventing zero divisions. 133 | vocabulary: 134 | Vocabulary to use. If None, the vocabulary is inferred from the data. 135 | norm: 136 | Normalization to use for the tfidf matrix. Either "l1" or "l2". 137 | sublinear_tf: 138 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 139 | """ 140 | 141 | max_features: Optional[int] = None 142 | ngram_range: Tuple[int, int] = (1, 1) 143 | smooth_idf: bool = True 144 | vocabulary: Optional[Dict[str, int]] = None 145 | norm: Optional[Literal["l1", "l2"]] = None 146 | sublinear_tf: bool = False 147 | 148 | 149 | class TfidfVectorizer(BaseModel): 150 | r"""A simple term frequency-inverse document frequency (tf-idf) vectorizer 151 | that can be loaded from and serialized to JSON. 
152 | 153 | This implementation replicates the behavior of scikit-learn's (as of 1.3.2), 154 | but only supports a subset of its parameters. 155 | 156 | For more information on tf-idf, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html 157 | 158 | Parameters 159 | ---------- 160 | config: 161 | Configuration for the vectorizer. 162 | idf_vector: 163 | Precomputed idf vector. If None, it is computed from the data. 164 | vocabulary: 165 | Vocabulary to use. If None, the vocabulary is inferred from the data. 166 | 167 | Examples 168 | -------- 169 | >>> docs = ["The quick brown fox", "jumps over", "the lazy dog."] 170 | >>> vectorizer = TfidfVectorizer(config=TfidfConfig()) 171 | >>> tfidf = vectorizer.fit_transform(docs) 172 | >>> tfidf.shape 173 | (3, 8) 174 | """ 175 | 176 | config: TfidfConfig 177 | idf_vector: List[float] = list() 178 | vocabulary: Dict[str, int] = Field(default_factory=dict) 179 | 180 | def _get_idf_vector( 181 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 182 | ) -> List[float]: 183 | """Compute the idf vector for the whole corpus from a list of 184 | ngram counts from each document. 185 | 186 | Parameters 187 | ---------- 188 | ngram_counts: 189 | List of ngram counts for each document. 190 | vocab: 191 | Vocabulary to use. Each ngram key has an integer value used as the 192 | column index of the output matrix. 
193 | """ 194 | idf_vector = np.zeros(len(vocab), dtype=np.float64) 195 | for record in ngram_counts: 196 | idf_vector[[vocab[t] for t in record.keys() if t in vocab]] += 1 197 | n_docs = len(ngram_counts) + int(self.config.smooth_idf) 198 | idf_vector += int(self.config.smooth_idf) 199 | idf_vector = 1 + np.log(n_docs / (idf_vector)) 200 | return list(idf_vector) 201 | 202 | def _get_tf_matrix( 203 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 204 | ) -> sp.csr_matrix: 205 | """Compute the term frequency matrix for the whole corpus from a 206 | list of ngram counts from each document. 207 | 208 | Parameters 209 | ---------- 210 | ngram_counts: 211 | List of ngram counts for each document (rows of the output matrix). 212 | vocab: 213 | Vocabulary to use. Each ngram key has an integer value used as the 214 | column index of the output matrix. 215 | """ 216 | tf_matrix = sp.lil_matrix( 217 | (len(ngram_counts), len(vocab)), dtype=np.float64 218 | ) 219 | for idx, record in enumerate(ngram_counts): 220 | pairs = record.items() 221 | counts = [v for _, v in pairs] 222 | tf_matrix[idx, [vocab[t] for t, _ in pairs]] = [c for c in counts] 223 | tf_matrix = tf_matrix.tocsr() 224 | if self.config.sublinear_tf: 225 | # applies log in place 226 | np.log(tf_matrix.data, tf_matrix.data) # type: ignore 227 | tf_matrix.data += 1 # type: ignore 228 | return tf_matrix 229 | 230 | def _get_tfidf( 231 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 232 | ) -> sp.csr_matrix: 233 | """Compute the tfidf matrix over the whole corpus from a list of 234 | ngram counts from each document. 235 | 236 | Parameters 237 | ---------- 238 | ngram_counts: 239 | List of ngram counts for each document. 240 | vocab: 241 | Vocabulary to use. Each ngram key has an integer value used as the 242 | column index of the output matrix. 
243 | """ 244 | tf_matrix: sp.csr_matrix = self._get_tf_matrix( 245 | ngram_counts, vocab=vocab 246 | ) 247 | 248 | tfidf_matrix = tf_matrix.multiply(np.array(self.idf_vector)) # type: ignore 249 | return tfidf_matrix.tocsr() # type: ignore 250 | 251 | def _get_vocabulary( 252 | self, ngram_counts: Iterable[Counter[str]] 253 | ) -> dict[str, int]: 254 | """Get the vocabulary from a list of ngram counts. The vocabulary 255 | is a mapping from ngrams to integer used as column indices in the 256 | tfidf matrix. 257 | 258 | Parameters 259 | ---------- 260 | ngram_counts: 261 | List of ngram counts for each document. 262 | """ 263 | counts_corpus = reduce(lambda x, y: x | y, ngram_counts).most_common() 264 | if self.config.max_features is not None: 265 | counts_corpus = counts_corpus[: self.config.max_features] 266 | return { 267 | t[0]: i 268 | for i, t in enumerate(sorted(counts_corpus, key=lambda x: x[0])) 269 | } 270 | 271 | def fit(self, data: Iterable[str]): 272 | """Fit the vectorizer to a list of documents. 273 | 274 | Parameters 275 | ---------- 276 | data: 277 | List of documents contents to fit the vectorizer to.""" 278 | counts_records: List[Counter[str]] = [ 279 | get_ngram_counts(doc, self.config.ngram_range) for doc in data 280 | ] 281 | vocab = self.config.vocabulary or self._get_vocabulary(counts_records) 282 | self.idf_vector = self._get_idf_vector(counts_records, vocab=vocab) 283 | self.vocabulary = vocab 284 | 285 | def transform(self, data: Iterable[str]) -> sp.csr_matrix: 286 | """Transform a list of documents into a tfidf matrix. 287 | The model must be fit before calling this method. 288 | 289 | Parameters 290 | ---------- 291 | data: 292 | List of documents contents to transform. 293 | """ 294 | if not self.vocabulary: 295 | raise ValueError("Vocabulary is empty. 
Call `fit` first.") 296 | counts_records = [ 297 | get_ngram_counts(doc, self.config.ngram_range) for doc in data 298 | ] 299 | counts_records = [ 300 | Counter({k: v for k, v in doc.items() if k in self.vocabulary}) 301 | for doc in counts_records 302 | ] 303 | tfidf = self._get_tfidf(counts_records, vocab=self.vocabulary) 304 | if self.config.norm is not None: 305 | return normalize_csr_rows(tfidf, norm=self.config.norm) 306 | return tfidf 307 | 308 | def fit_transform(self, data: Iterable[str]) -> sp.csr_matrix: 309 | """Fit the vectorizer to a list of documents and transform them 310 | into a tfidf matrix. 311 | 312 | Parameters 313 | ---------- 314 | data: 315 | List of documents contents to fit the vectorizer to and transform. 316 | """ 317 | self.fit(list(data)) 318 | return self.transform(data) 319 | -------------------------------------------------------------------------------- /gimie/extractors/gitlab.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | from __future__ import annotations 18 | from dataclasses import dataclass 19 | import os 20 | import requests 21 | from datetime import datetime 22 | from dateutil.parser import isoparse 23 | from functools import cached_property 24 | from typing import Any, Dict, List, Optional, Union 25 | from urllib.parse import urlparse 26 | from dotenv import load_dotenv 27 | from gimie.io import RemoteResource 28 | from gimie.models import ( 29 | Organization, 30 | Person, 31 | Repository, 32 | ) 33 | from gimie.extractors.abstract import Extractor 34 | from gimie.extractors.common.queries import send_graphql_query, send_rest_query 35 | 36 | load_dotenv() 37 | 38 | 39 | @dataclass 40 | class GitlabExtractor(Extractor): 41 | """Extractor for Gitlab repositories. Uses the Gitlab GraphQL API to 42 | extract metadata into linked data. 43 | url: str 44 | The url of the git repository. 45 | base_url: Optional[str] 46 | The base url of the git remote. 47 | 48 | """ 49 | 50 | url: str 51 | base_url: Optional[str] = None 52 | local_path: Optional[str] = None 53 | 54 | token: Optional[str] = None 55 | 56 | def list_files(self) -> List[RemoteResource]: 57 | """takes the root repository folder and returns the list of files present""" 58 | file_list = [] 59 | file_dict = self._repo_data["repository"]["tree"]["blobs"]["nodes"] 60 | defaultbranchref = self._repo_data["repository"]["rootRef"] 61 | for item in file_dict: 62 | file = RemoteResource( 63 | path=item["name"], 64 | url=f'{self.url}/-/raw/{defaultbranchref}/{item["name"]}', 65 | headers=self._headers, 66 | ) 67 | file_list.append(file) 68 | return file_list 69 | 70 | def extract(self) -> Repository: 71 | """Extract metadata from target Gitlab repository.""" 72 | 73 | # fetch metadata 74 | data = self._repo_data 75 | 76 | # NOTE(identifier): Each Gitlab project has a unique identifier (integer) 77 | # NOTE(author): Fetches only the group directly related to the project 78 | # the group takes the form: parent/subgroup 79 | 80 | 
# NOTE(contributors): contributors = project members 81 | # who are not owners + those that have written merge requests 82 | # owners are either multiple individuals or a group. If no user 83 | # is marked as owner, contributors are project members or merge 84 | # request authors 85 | repo_meta = dict( 86 | authors=self._safe_extract_author(data), 87 | contributors=self._safe_extract_contributors(data), 88 | date_created=isoparse(data["createdAt"][:-1]), 89 | date_modified=isoparse(data["lastActivityAt"][:-1]), 90 | description=data["description"], 91 | identifier=urlparse(data["id"]).path.split("/")[2], 92 | keywords=data["topics"], 93 | name=self.path, 94 | prog_langs=[lang["name"] for lang in data["languages"]], 95 | url=self.url, 96 | ) 97 | 98 | if data["releases"]["edges"]: 99 | repo_meta["date_published"] = isoparse( 100 | data["releases"]["edges"][0]["node"]["releasedAt"] 101 | ) 102 | 103 | if data["releases"] and (len(data["releases"]["edges"]) > 0): 104 | # go into releases and take the name from the first node (most recent) 105 | version = data["releases"]["edges"][0]["node"]["name"] 106 | repo_meta["version"] = version 107 | repo_meta[ 108 | "download_url" 109 | ] = f"{self.url}/-/archive/{version}/{self.path.split('/')[-1]}-{version}.tar.gz" 110 | return Repository(**repo_meta) # type: ignore 111 | 112 | def _safe_extract_author( 113 | self, repo: Dict[str, Any] 114 | ) -> List[Union[Person, Organization]]: 115 | """Extract the author from a GraphQL repository node. 
116 | projectMembers is used if available, otherwise the author 117 | is inferred from the project url.""" 118 | members = repo["projectMembers"]["edges"] 119 | if len(members) > 0: 120 | owners = filter( 121 | lambda m: m["node"]["accessLevel"]["stringValue"] == "OWNER", 122 | members, 123 | ) 124 | return [ 125 | self._get_author(owner["node"]["user"]) for owner in owners 126 | ] 127 | 128 | if repo["group"] is not None: 129 | return [self._get_author(repo["group"])] 130 | 131 | # If the author is absent from the GraphQL response (permission bug), 132 | # fallback to the REST API 133 | return [self._user_from_rest(self.path.split("/")[0])] 134 | 135 | def _safe_extract_contributors( 136 | self, repo: dict[str, Any] 137 | ) -> List[Person] | None: 138 | members = [ 139 | user["node"]["user"] 140 | for user in repo["projectMembers"]["edges"] 141 | if user["node"]["accessLevel"]["stringValue"] != "OWNER" 142 | ] 143 | merge_request_authors = [ 144 | author["node"]["author"] 145 | for author in repo["mergeRequests"]["edges"] 146 | ] 147 | contributors = members + merge_request_authors 148 | # Drop duplicate (unhashable) dicts by "id" key 149 | uniq_contrib = list({c["id"]: c for c in contributors}.values()) 150 | return [self._get_user(contrib) for contrib in uniq_contrib] 151 | 152 | @cached_property 153 | def _repo_data(self) -> Dict[str, Any]: 154 | """Fetch repository metadata from GraphQL endpoint.""" 155 | data = {"path": self.path} 156 | project_query = """ 157 | query project_query($path: ID!) 
{ 158 | project(fullPath: $path) { 159 | name 160 | id 161 | description 162 | createdAt 163 | lastActivityAt 164 | group { 165 | id 166 | name 167 | description 168 | avatarUrl 169 | webUrl 170 | } 171 | languages { 172 | name 173 | share 174 | } 175 | topics 176 | projectMembers { 177 | edges { 178 | node { 179 | id 180 | accessLevel { 181 | stringValue 182 | } 183 | user { 184 | id 185 | name 186 | username 187 | publicEmail 188 | webUrl 189 | } 190 | } 191 | } 192 | } 193 | mergeRequests{ 194 | edges { 195 | node { 196 | author { 197 | id 198 | name 199 | username 200 | publicEmail 201 | webUrl 202 | } 203 | } 204 | } 205 | } 206 | repository { 207 | rootRef 208 | tree{ 209 | blobs{ 210 | nodes { 211 | name 212 | webUrl 213 | } 214 | } 215 | } 216 | } 217 | releases { 218 | edges { 219 | node { 220 | name 221 | releasedAt 222 | } 223 | } 224 | } 225 | } 226 | } 227 | """ 228 | response = send_graphql_query( 229 | self.graphql_endpoint, project_query, data, self._headers 230 | ) 231 | if "errors" in response: 232 | raise ValueError(response["errors"]) 233 | 234 | return response["data"]["project"] 235 | 236 | @cached_property 237 | def _headers(self) -> Any: 238 | """Set authentication headers for Gitlab API requests.""" 239 | try: 240 | if not self.token: 241 | self.token = os.environ.get("GITLAB_TOKEN") 242 | assert self.token 243 | headers = {"Authorization": f"token {self.token}"} 244 | 245 | login = requests.get(f"{self.rest_endpoint}/user", headers=headers) 246 | assert login.json().get("login") 247 | except AssertionError: 248 | return {} 249 | else: 250 | return headers 251 | 252 | def _get_author(self, node: Dict[str, Any]) -> Union[Organization, Person]: 253 | """Given the GraphQL node for a repository owner, 254 | return the author as a Person or Organization object.""" 255 | # Is this the best test? 
256 | if "username" in node: 257 | return self._get_user(node) 258 | return self._get_organization(node) 259 | 260 | def _get_organization(self, node: Dict[str, Any]) -> Organization: 261 | """Extract details from a GraphQL organization node.""" 262 | return Organization( 263 | _id=node["webUrl"], 264 | name=node["name"], 265 | description=node.get("description"), 266 | logo=node.get("avatarUrl"), 267 | ) 268 | 269 | def _get_user(self, node: Dict[str, Any]) -> Person: 270 | """Extract details from a GraphQL user node.""" 271 | return Person( 272 | _id=node["webUrl"], 273 | identifier=node["username"], 274 | name=node.get("name"), 275 | email=node.get("publicEmail"), 276 | ) 277 | 278 | def _user_from_rest(self, username: str) -> Person: 279 | """Given a username, use the REST API to retrieve the Person object.""" 280 | 281 | author = send_rest_query( 282 | self.rest_endpoint, 283 | f"/users?username={username}", 284 | self._headers, 285 | ) 286 | if isinstance(author, list): 287 | author = author[0] 288 | 289 | return Person( 290 | _id=author["web_url"], 291 | identifier=author["username"], 292 | name=author.get("name"), 293 | ) 294 | 295 | @property 296 | def rest_endpoint(self) -> str: 297 | return f"{self.base}/api/v4/" 298 | 299 | @property 300 | def graphql_endpoint(self) -> str: 301 | return f"{self.base}/api" 302 | -------------------------------------------------------------------------------- /gimie/extractors/github.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from __future__ import annotations 18 | 19 | from dataclasses import dataclass 20 | from dateutil.parser import isoparse 21 | from functools import cached_property 22 | import os 23 | import requests 24 | from typing import Any, Dict, List, Optional, Union 25 | from urllib.parse import urlparse 26 | from dotenv import load_dotenv 27 | 28 | from gimie.extractors.abstract import Extractor 29 | from gimie.models import ( 30 | Organization, 31 | Person, 32 | Repository, 33 | ) 34 | 35 | from gimie.io import RemoteResource 36 | from gimie.extractors.common.queries import ( 37 | send_rest_query, 38 | send_graphql_query, 39 | ) 40 | 41 | GH_API = "https://api.github.com" 42 | load_dotenv() 43 | 44 | 45 | def query_contributors( 46 | url: str, headers: Dict[str, str] 47 | ) -> List[Dict[str, Any]]: 48 | """Queries the list of contributors of target repository 49 | using GitHub's REST and GraphQL APIs. Returns a list of GraphQL User nodes. 50 | NOTE: This is a workaround for the lack of a contributors field in the GraphQL API. 51 | """ 52 | owner, name = urlparse(url).path.strip("/").split("/") 53 | # Get contributors (available in the REST API but not GraphQL) 54 | data = f"repos/{owner}/{name}/contributors" 55 | contributors = send_rest_query(GH_API, data, headers=headers) 56 | ids = [contributor["node_id"] for contributor in contributors] 57 | # Get all contributors' metadata in 1 GraphQL query 58 | users_query = """ 59 | query users($ids: [ID!]!) { 60 | nodes(ids: $ids) { 61 | ... 
on User { 62 | avatarUrl 63 | company 64 | login 65 | name 66 | organizations(first: 100) { 67 | nodes { 68 | avatarUrl 69 | description 70 | login 71 | name 72 | url 73 | } 74 | } 75 | url 76 | } 77 | } 78 | }""" 79 | 80 | contributors = send_graphql_query( 81 | GH_API, users_query, data={"ids": ids}, headers=headers 82 | ) 83 | # Drop empty users (e.g. dependabot) 84 | return [user for user in contributors["data"]["nodes"] if user] 85 | 86 | 87 | @dataclass 88 | class GithubExtractor(Extractor): 89 | """Extractor for GitHub repositories. Uses the GitHub GraphQL API to 90 | extract metadata into linked data. 91 | url: str 92 | The url of the git repository. 93 | base_url: Optional[str] 94 | The base url of the git remote. 95 | """ 96 | 97 | url: str 98 | base_url: Optional[str] = None 99 | local_path: Optional[str] = None 100 | 101 | token: Optional[str] = None 102 | 103 | def list_files(self) -> List[RemoteResource]: 104 | """takes the root repository folder and returns the list of files present""" 105 | file_list = [] 106 | file_dict = self._repo_data["object"]["entries"] 107 | repo_url = self._repo_data["url"] 108 | defaultbranchref = self._repo_data["defaultBranchRef"]["name"] 109 | 110 | for item in file_dict: 111 | file = RemoteResource( 112 | path=item["name"], 113 | url=f'{repo_url}/raw/{defaultbranchref}/{item["path"]}', 114 | headers=self._headers, 115 | ) 116 | file_list.append(file) 117 | return file_list 118 | 119 | def extract(self) -> Repository: 120 | """Extract metadata from target GitHub repository.""" 121 | data = self._repo_data 122 | 123 | repo_meta = dict( 124 | authors=[self._get_author(data["owner"])], 125 | contributors=self._fetch_contributors(), 126 | date_created=isoparse(data["createdAt"][:-1]), 127 | date_modified=isoparse(data["updatedAt"][:-1]), 128 | description=data["description"], 129 | name=self.path, 130 | keywords=self._get_keywords(*data["repositoryTopics"]["nodes"]), 131 | url=self.url, 132 | ) 133 | if data["parent"]: 134 | 
repo_meta["parent_repository"] = data["parent"]["url"] 135 | 136 | if data["latestRelease"]: 137 | repo_meta["date_published"] = isoparse( 138 | data["latestRelease"]["publishedAt"] 139 | ) 140 | 141 | if data["primaryLanguage"] is not None: 142 | repo_meta["prog_langs"] = [data["primaryLanguage"]["name"]] 143 | 144 | if data["latestRelease"]: 145 | version = data["latestRelease"]["name"] 146 | download_url = f"{self.url}/archive/refs/tags/{version}.tar.gz" 147 | repo_meta["download_url"] = download_url 148 | repo_meta["version"] = version 149 | 150 | return Repository(**repo_meta) # type: ignore 151 | 152 | @cached_property 153 | def _repo_data(self) -> Dict[str, Any]: 154 | """Repository metadata fetched from GraphQL endpoint.""" 155 | owner, name = self.path.split("/") 156 | data = {"owner": owner, "name": name} 157 | repo_query = """ 158 | query repo($owner: String!, $name: String!) { 159 | repository(name: $name, owner: $owner) { 160 | url 161 | parent {url} 162 | createdAt 163 | description 164 | latestRelease { 165 | publishedAt 166 | name 167 | } 168 | defaultBranchRef { 169 | name 170 | } 171 | object(expression: "HEAD:") { 172 | ... on Tree { 173 | 174 | entries { 175 | name 176 | path 177 | } 178 | } 179 | } 180 | mentionableUsers(first: 100) { 181 | nodes { 182 | login 183 | name 184 | avatarUrl 185 | company 186 | organizations(first: 100) { 187 | nodes { 188 | avatarUrl 189 | description 190 | login 191 | name 192 | url 193 | } 194 | } 195 | url 196 | } 197 | } 198 | name 199 | owner { 200 | avatarUrl 201 | login 202 | url 203 | ... on User { 204 | company 205 | name 206 | organizations(first: 100) { 207 | nodes { 208 | avatarUrl 209 | description 210 | login 211 | name 212 | url 213 | } 214 | } 215 | } 216 | ... 
on Organization { 217 | name 218 | description 219 | } 220 | } 221 | primaryLanguage { 222 | name 223 | } 224 | repositoryTopics(first: 10) { 225 | nodes { 226 | topic { 227 | name 228 | } 229 | } 230 | } 231 | updatedAt 232 | url 233 | } 234 | } 235 | """ 236 | response = send_graphql_query(GH_API, repo_query, data, self._headers) 237 | 238 | if "errors" in response: 239 | raise ValueError(response["errors"]) 240 | 241 | return response["data"]["repository"] 242 | 243 | def _fetch_contributors(self) -> List[Person]: 244 | """Queries the GitHub GraphQL API to extract contributors through the commit list. 245 | NOTE: This is a workaround for the lack of a contributors field in the GraphQL API. 246 | """ 247 | contributors = [] 248 | resp = query_contributors(self.url, self._headers) 249 | for user in resp: 250 | contributors.append(self._get_user(user)) 251 | return list(contributors) 252 | 253 | @cached_property 254 | def _headers(self) -> Any: 255 | """Set authentication headers for GitHub API requests.""" 256 | try: 257 | if not self.token: 258 | self.token = os.environ.get("GITHUB_TOKEN") 259 | if not self.token: 260 | raise ValueError( 261 | "GitHub token not found. Please set the GITHUB_TOKEN environment variable " 262 | "with your GitHub personal access token." 263 | ) 264 | headers = {"Authorization": f"token {self.token}"} 265 | 266 | login = requests.get(f"{GH_API}/user", headers=headers) 267 | if not login.ok or not login.json().get("login"): 268 | raise ValueError( 269 | "GitHub authentication failed. Please check that your GITHUB_TOKEN is valid." 
270 | ) 271 | return headers 272 | except requests.exceptions.RequestException as e: 273 | raise ConnectionError(f"Failed to connect to GitHub API: {str(e)}") 274 | 275 | def _get_keywords(self, *nodes: Dict[str, Any]) -> List[str]: 276 | """Extract names from GraphQL topic nodes.""" 277 | return [node["topic"]["name"] for node in nodes] 278 | 279 | def _get_organization(self, node: Dict[str, Any]) -> Organization: 280 | """Extract details from a GraphQL organization node.""" 281 | return Organization( 282 | _id=node["url"], 283 | name=node["login"], 284 | description=node["description"], 285 | legal_name=node["name"], 286 | logo=node["avatarUrl"], 287 | ) 288 | 289 | def _get_author(self, node: Dict[str, Any]) -> Union[Organization, Person]: 290 | """Given the GraphQL node for a repository owner, 291 | return the author as a Person or Organization object.""" 292 | 293 | if "organizations" in node: 294 | return self._get_user(node) 295 | 296 | return self._get_organization(node) 297 | 298 | def _get_user(self, node: Dict[str, Any]) -> Person: 299 | """Extract details from a GraphQL user node.""" 300 | # Get user's affiliations 301 | orgs = [ 302 | self._get_organization(org) 303 | for org in node["organizations"]["nodes"] 304 | ] 305 | return Person( 306 | _id=node["url"], 307 | identifier=node["login"], 308 | name=node["name"], 309 | affiliations=orgs, 310 | ) 311 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /gimie/parsers/license/data/tfidf_vectorizer.json: -------------------------------------------------------------------------------- 1 | 
{"config":{"max_features":700,"ngram_range":[1,2],"smooth_idf":true,"vocabulary":null,"norm":"l2","sublinear_tf":true},"idf_vector":[1.2724146003209924,1.3457458734065422,5.174387269895637,2.535329940280379,2.535329940280379,4.481240089335692,4.768922161787472,4.258096538021482,5.174387269895637,2.7764919970972666,4.768922161787472,2.7764919970972666,2.6486586255873816,4.768922161787472,5.174387269895637,3.921624301400269,4.258096538021482,5.174387269895637,2.203972804325936,2.203972804325936,1.4608152031913293,1.4367176516122688,2.0173868487455238,3.5649493574615367,4.258096538021482,4.768922161787472,4.768922161787472,2.3121863889661687,5.174387269895637,2.2840155119994723,5.174387269895637,4.768922161787472,1.9555114450274362,2.7320402345264325,2.2299482907291965,3.7880929087757464,2.341173925839421,2.3710268889891024,3.921624301400269,1.0472528848505456,3.921624301400269,2.6486586255873816,1.550046336919272,3.921624301400269,4.481240089335692,1.1670540846631663,3.159484249353372,2.689480620107637,2.6094379124341005,3.228477120840324,2.203972804325936,1.4855078157817008,2.3121863889661687,2.7764919970972666,1.807091439909163,2.083344816537321,3.3826278006675823,3.3826278006675823,3.3025850929940455,2.038893053966487,1.6190392084062235,1.149035579160488,1.6190392084062235,1.303186258987746,1.023347363996991,2.083344816537321,1.6628418310646165,1.789997006549863,1.3566749439387324,1.3788980807234426,1.0312525435041044,2.401798547655856,2.466337068793427,4.768922161787472,3.0343211063993665,1.8244831826210324,5.174387269895637,1.4855078157817008,2.0173868487455238,2.6094379124341005,2.535329940280379,2.083344816537321,2.689480620107637,4.258096538021482,4.258096538021482,4.258096538021482,1.550046336919272,1.916290731874155,1.5108256237659907,1.1854032233313627,2.2299482907291965,3.3826278006675823,1.252413933614323,1.5368011101692514,1.498086597988561,2.6486586255873816,4.258096538021482,4.481240089335692,1.590868331439527,2.689480620107637,4.481240089335692,1.8244
831826210324,2.0608719606852626,1.0635134057223259,1.6931471805599454,5.174387269895637,2.3121863889661687,5.174387269895637,1.2328054622259468,1.131336002061087,4.258096538021482,2.923095471289142,2.3121863889661687,5.174387269895637,1.252413933614323,1.916290731874155,2.923095471289142,4.481240089335692,1.8421827597204332,1.7731898882334818,4.481240089335692,4.481240089335692,2.1063343347620203,2.3121863889661687,1.7404000654104907,1.7086513670959107,1.604854573414267,4.768922161787472,1.2231435513142097,2.178654996341646,4.768922161787472,2.1539623837512747,2.466337068793427,2.203972804325936,2.0608719606852626,3.469639177657212,2.1539623837512747,2.689480620107637,3.228477120840324,1.7566605862822713,4.075774981227527,4.481240089335692,3.3025850929940455,4.258096538021482,3.469639177657212,5.174387269895637,2.7764919970972666,2.8230120127321596,4.481240089335692,4.481240089335692,1.3136575588550414,2.571697584451253,2.401798547655856,1.9555114450274362,2.535329940280379,2.923095471289142,1.7566605862822713,3.094945728215801,3.670309873119363,3.3025850929940455,3.5649493574615367,1.424883193965266,1.6480267452794757,1.2135741002980591,1.3566749439387324,1.0968498259899175,2.3121863889661687,2.8718021769015913,2.6486586255873816,3.670309873119363,1.9555114450274362,3.228477120840324,4.075774981227527,4.481240089335692,3.921624301400269,1.8421827597204332,3.469639177657212,5.174387269895637,3.469639177657212,4.481240089335692,1.149035579160488,1.550046336919272,5.174387269895637,5.174387269895637,4.258096538021482,4.258096538021482,3.670309873119363,1.7731898882334818,2.689480620107637,1.8421827597204332,1.677879708429157,5.174387269895637,4.258096538021482,4.481240089335692,2.7764919970972666,3.3025850929940455,3.7880929087757464,4.075774981227527,3.921624301400269,1.7566605862822713,1.1947056159936762,2.9771626925594177,1.6931471805599454,1.5770750093071912,2.8230120127321596,1.3788980807234426,1.7086513670959107,2.8718021769015913,1.4486938426589844,1.6778797084
29157,1.633427945858323,1.633427945858323,4.481240089335692,3.670309873119363,5.174387269895637,4.481240089335692,4.481240089335692,4.481240089335692,3.469639177657212,1.424883193965266,2.433547245970436,1.5237290286018985,3.3826278006675823,1.807091439909163,4.481240089335692,3.0343211063993665,5.174387269895637,5.174387269895637,4.481240089335692,4.481240089335692,1.5237290286018985,2.0173868487455238,4.481240089335692,5.174387269895637,2.689480620107637,2.7320402345264325,1.807091439909163,3.670309873119363,1.633427945858323,4.481240089335692,3.0343211063993665,1.8244831826210324,1.6480267452794757,1.252413933614323,4.258096538021482,4.481240089335692,1.0392207131532814,1.149035579160488,1.0717439048588413,1.5368011101692514,1.252413933614323,2.401798547655856,2.571697584451253,2.401798547655856,5.174387269895637,5.174387269895637,5.174387269895637,1.6480267452794757,2.6486586255873816,2.8718021769015913,1.1226023220923322,1.5770750093071912,2.1298648321722142,2.6094379124341005,2.401798547655856,2.8230120127321596,3.3826278006675823,2.8230120127321596,4.075774981227527,3.469639177657212,2.038893053966487,5.174387269895637,2.401798547655856,5.174387269895637,2.923095471289142,5.174387269895637,3.0343211063993665,1.6190392084062235,1.2928234719521994,1.4608152031913293,1.4608152031913293,2.571697584451253,1.4486938426589844,1.9963334395476915,2.2840155119994723,1.3788980807234426,1.1580042491432483,1.916290731874155,1.677879708429157,1.0472528848505456,1.590868331439527,1.2825669717850103,3.094945728215801,1.149035579160488,1.590868331439527,1.5368011101692514,1.0717439048588413,1.677879708429157,2.571697584451253,3.7880929087757464,3.5649493574615367,5.174387269895637,1.9757141523449557,1.9757141523449557,2.3121863889661687,1.023347363996991,2.1063343347620203,1.8602012652231115,1.3788980807234426,1.8244831826210324,4.768922161787472,1.2928234719521994,2.2840155119994723,4.258096538021482,4.481240089335692,2.2299482907291965,5.174387269895637,1.5634693572514127,3
.3826278006675823,4.481240089335692,4.481240089335692,1.633427945858323,3.469639177657212,3.228477120840324,3.228477120840324,1.3677247801253174,3.469639177657212,2.8718021769015913,3.921624301400269,4.768922161787472,3.921624301400269,3.7880929087757464,3.7880929087757464,3.670309873119363,4.481240089335692,1.1854032233313627,2.083344816537321,2.7320402345264325,1.807091439909163,1.677879708429157,1.916290731874155,1.604854573414267,2.083344816537321,1.8421827597204332,5.174387269895637,5.174387269895637,4.768922161787472,5.174387269895637,2.3710268889891024,1.9757141523449557,2.256616537811358,3.0343211063993665,2.571697584451253,1.4730852957831435,4.481240089335692,1.3242396681855786,1.4367176516122688,2.0173868487455238,2.1298648321722142,5.174387269895637,5.174387269895637,1.390197635977376,1.7086513670959107,2.401798547655856,2.466337068793427,5.174387269895637,1.2425616371713113,1.550046336919272,3.5649493574615367,2.6486586255873816,1.5634693572514127,2.178654996341646,1.7404000654104907,1.390197635977376,1.4486938426589844,2.8230120127321596,1.5770750093071912,4.481240089335692,2.466337068793427,1.262364264467491,4.481240089335692,1.6480267452794757,5.174387269895637,5.174387269895637,1.2425616371713113,1.8972425369034605,5.174387269895637,2.689480620107637,1.789997006549863,1.055350095083165,5.174387269895637,5.174387269895637,1.0635134057223259,1.2135741002980591,1.4486938426589844,2.0173868487455238,2.1063343347620203,1.8244831826210324,5.174387269895637,1.0312525435041044,1.6628418310646165,1.2825669717850103,2.8718021769015913,4.768922161787472,4.481240089335692,5.174387269895637,1.3457458734065422,1.8785504038913081,1.055350095083165,1.1053605156578263,1.5770750093071912,1.1761865682264387,2.6486586255873816,1.8244831826210324,1.789997006549863,1.498086597988561,2.1063343347620203,2.401798547655856,1.0392207131532814,1.807091439909163,2.923095471289142,1.498086597988561,4.075774981227527,1.8244831826210324,1.550046336919272,1.5770750093071912,2.871802
1769015913,1.4855078157817008,3.159484249353372,5.174387269895637,3.921624301400269,2.8718021769015913,5.174387269895637,5.174387269895637,1.1139442593492177,1.3349349573023266,4.481240089335692,4.481240089335692,1.3136575588550414,2.7320402345264325,3.921624301400269,3.670309873119363,4.481240089335692,3.3025850929940455,1.5237290286018985,5.174387269895637,5.174387269895637,1.6931471805599454,3.7880929087757464,1.1139442593492177,1.8421827597204332,1.590868331439527,4.481240089335692,1.5237290286018985,1.7404000654104907,1.6190392084062235,2.923095471289142,1.7566605862822713,1.8972425369034605,4.768922161787472,2.5002386204691085,5.174387269895637,2.1298648321722142,1.8602012652231115,3.7880929087757464,1.8785504038913081,2.178654996341646,2.7764919970972666,2.401798547655856,2.1539623837512747,3.921624301400269,1.7566605862822713,2.341173925839421,1.7566605862822713,1.055350095083165,1.390197635977376,1.9357088177312565,4.481240089335692,4.481240089335692,5.174387269895637,1.5237290286018985,2.0173868487455238,2.038893053966487,4.768922161787472,4.768922161787472,3.469639177657212,4.481240089335692,4.258096538021482,5.174387269895637,2.571697584451253,2.0173868487455238,2.083344816537321,3.7880929087757464,2.433547245970436,1.6628418310646165,2.8230120127321596,2.6486586255873816,1.7243997240640498,1.8602012652231115,2.038893053966487,2.1298648321722142,1.916290731874155,1.4608152031913293,1.262364264467491,5.174387269895637,4.768922161787472,5.174387269895637,2.923095471289142,3.3826278006675823,3.921624301400269,1.6931471805599454,2.0608719606852626,3.3826278006675823,2.203972804325936,2.401798547655856,1.1580042491432483,1.6480267452794757,3.670309873119363,1.789997006549863,1.4486938426589844,1.0472528848505456,2.0173868487455238,2.7764919970972666,1.677879708429157,1.6190392084062235,1.1580042491432483,3.670309873119363,2.401798547655856,1.3566749439387324,3.228477120840324,2.7320402345264325,2.9771626925594177,4.075774981227527,2.3121863889661687,1.4248831
93965266,2.535329940280379,5.174387269895637,1.4367176516122688,1.9963334395476915,4.258096538021482,5.174387269895637,1.1761865682264387,2.341173925839421,3.7880929087757464,5.174387269895637,3.670309873119363,5.174387269895637,5.174387269895637,5.174387269895637,5.174387269895637,2.923095471289142,3.0343211063993665,4.481240089335692,2.0608719606852626,1.424883193965266,1.7731898882334818,1.5368011101692514,2.0608719606852626,1.6628418310646165,1.088410957344053,1.8785504038913081,2.5002386204691085,5.174387269895637,1.252413933614323,3.670309873119363,1.5770750093071912,1.0312525435041044,3.5649493574615367,3.469639177657212,2.535329940280379,3.921624301400269,2.2299482907291965,2.5002386204691085,1.590868331439527,3.159484249353372,2.571697584451253,4.481240089335692,4.768922161787472,4.481240089335692,1.2724146003209924,4.258096538021482,5.174387269895637,2.8718021769015913,2.923095471289142,5.174387269895637,2.8230120127321596,3.670309873119363,4.258096538021482,4.258096538021482,1.9757141523449557,2.689480620107637,3.670309873119363,2.5002386204691085,2.341173925839421,2.535329940280379,2.8230120127321596,4.258096538021482,1.604854573414267,4.075774981227527,4.768922161787472,2.7320402345264325,3.3025850929940455,2.6094379124341005,4.768922161787472,1.7566605862822713,1.604854573414267,4.075774981227527,2.8718021769015913,5.174387269895637,5.174387269895637,1.4608152031913293,2.535329940280379,2.178654996341646,1.8421827597204332,2.1298648321722142,3.0343211063993665,2.0608719606852626,1.8785504038913081,1.7731898882334818,1.9555114450274362,1.0312525435041044,3.0343211063993665,4.075774981227527,1.424883193965266,3.7880929087757464,1.5368011101692514,1.6480267452794757,1.0312525435041044,3.670309873119363,1.9963334395476915,1.390197635977376,2.3121863889661687,1.0717439048588413,1.633427945858323,1.2328054622259468,4.481240089335692,1.9555114450274362,4.258096538021482,1.2825669717850103,2.2299482907291965,1.4367176516122688,1.4730852957831435,4.481240089335
692,3.7880929087757464,2.256616537811358,2.6486586255873816,2.2299482907291965,1.0312525435041044,1.3242396681855786,1.3457458734065422,2.1298648321722142,1.916290731874155,2.2840155119994723,3.5649493574615367,1.7404000654104907,2.9771626925594177,1.303186258987746,1.7086513670959107,1.7243997240640498,1.9757141523449557,1.1053605156578263,1.3677247801253174,2.8230120127321596,3.094945728215801,1.7404000654104907,1.2328054622259468,1.4016263318009987,2.535329940280379,1.8785504038913081,2.1539623837512747,1.6190392084062235,1.0968498259899175,2.178654996341646,2.1539623837512747,1.1854032233313627,1.8972425369034605,1.0635134057223259,1.6190392084062235,1.590868331439527,5.174387269895637,2.083344816537321,3.159484249353372,2.433547245970436,2.2299482907291965,2.466337068793427,1.550046336919272,1.9555114450274362,1.390197635977376,3.670309873119363,1.9555114450274362,1.916290731874155,1.5634693572514127,1.7404000654104907,1.5634693572514127,4.768922161787472,3.3025850929940455,4.768922161787472,5.174387269895637,4.258096538021482,4.481240089335692],"vocabulary":{"":0," ":1," %":2," (a)":3," (b)":4," *":5," **":6," -":7," article":8," if":9," means":10," this":11," you":12,"\"third":13,"\"third party\"":14,"\"work":15,"\"work that":16,"%":17,"(a)":18,"(b)":19,"(c)":20,"(including":21,"(or":22,"(or any":23,"*":24,"**":25,"** ":26,"-":27,"- the":28,"1":29,"1 of":30,"16b1":31,"2":32,"22":33,"3":34,"3 of":35,"30":36,"6":37,"6 of":38,"a":39,"a \"work":40,"a contributor":41,"a copy":42,"a covered":43,"a derived":44,"a particular":45,"a product":46,"a program":47,"a recipient":48,"a subsequent":49,"a work":50,"above":51,"access":52,"access to":53,"action":54,"additional":55,"affero":56,"affero general":57,"agency":58,"agree":59,"agreement":60,"all":61,"also":62,"an":63,"and":64,"and ":65,"and all":66,"and conditions":67,"and the":68,"and/or":69,"any":70,"any contributor":71,"any derivative":72,"any extensions":73,"any modifications":74,"any of":75,"any or":76,"any 
other":77,"any person":78,"any portion":79,"any subsequent":80,"any such":81,"any third":82,"apple":83,"apple and":84,"apple's":85,"applicable":86,"application":87,"apply":88,"are":89,"are not":90,"article":91,"as":92,"as a":93,"at":94,"attribution":95,"attribution information":96,"au":97,"available":98,"b":99,"base":100,"based":101,"based on":102,"be":103,"been":104,"beopen":105,"both":106,"bull":107,"but":108,"by":109,"by apple":110,"by licensor":111,"by such":112,"by sybase":113,"by the":114,"by this":115,"c":116,"ca":117,"can":118,"case":119,"case may":120,"cern":121,"change":122,"changes":123,"charge":124,"claim":125,"claims":126,"cnri":127,"code":128,"code and":129,"code base":130,"code form":131,"code is":132,"code of":133,"code or":134,"code version":135,"combination":136,"combined":137,"combined work":138,"commercial":139,"commercial contributor":140,"commercial distributor":141,"communicate":142,"company":143,"compatible":144,"compatible source":145,"compiled":146,"component":147,"component of":148,"concédant":149,"conditions":150,"contribution":151,"contributions":152,"contributor":153,"contributor and":154,"contributor version":155,"contributors":156,"convey":157,"convey a":158,"convey the":159,"conveying":160,"copies":161,"copies of":162,"copy":163,"copy of":164,"copyright":165,"copyright holder":166,"corporation":167,"corresponding":168,"corresponding source":169,"covered":170,"covered code":171,"covered software":172,"covered source":173,"covered work":174,"create":175,"current":176,"current maintainer":177,"d":178,"d'auteur":179,"damages":180,"data":181,"data files":182,"datagrid":183,"de":184,"de la":185,"deploy":186,"derivative":187,"derivative work":188,"derivative works":189,"derived":190,"derived program":191,"derived work":192,"des":193,"description":194,"developer":195,"developer and":196,"developer original":197,"digital":198,"display":199,"distribute":200,"distribute or":201,"distribute the":202,"distributed":203,"distributed 
by":204,"distribution":205,"distribution of":206,"distributor":207,"do":208,"do not":209,"does":210,"does not":211,"doit":212,"downstream":213,"downstream distribution":214,"droit":215,"du":216,"du logiciel":217,"e":218,"each":219,"each contributor":220,"either":221,"en":222,"entity":223,"est":224,"et":225,"eu":226,"eu datagrid":227,"european":228,"european union":229,"except":230,"executable":231,"executable code":232,"executable distribution":233,"exhibit":234,"exhibit a":235,"expressly":236,"extensions":237,"extent":238,"externally":239,"federal":240,"file":241,"files":242,"following":243,"font":244,"font software":245,"for":246,"for a":247,"for any":248,"for the":249,"form":250,"forth":251,"forth in":252,"foundation":253,"frameworx":254,"frameworx code":255,"frameworx company":256,"free":257,"free software":258,"freedom":259,"from":260,"from the":261,"general":262,"general public":263,"give":264,"gnu":265,"gnu affero":266,"gnu general":267,"gnu gpl":268,"gnu lesser":269,"governed":270,"governed code":271,"governing":272,"governing jurisdiction":273,"government":274,"government agency":275,"gpl":276,"grant":277,"granted":278,"has":279,"have":280,"having":281,"hereby":282,"hereunder":283,"holder":284,"however":285,"if":286,"if the":287,"if you":288,"in":289,"in a":290,"in any":291,"in part":292,"in the":293,"in this":294,"include":295,"including":296,"information":297,"initial":298,"initial contributor":299,"initial developer":300,"initial work":301,"intellectual":302,"intellectual property":303,"interface":304,"is":305,"is a":306,"is not":307,"it":308,"it is":309,"items":310,"its":311,"jurisdiction":312,"la":313,"la licence":314,"larger":315,"latex":316,"law":317,"le":318,"le concédant":319,"le logiciel":320,"legal":321,"les":322,"lesser":323,"lesser general":324,"liability":325,"libraries":326,"library":327,"library and":328,"library general":329,"library is":330,"library or":331,"library\"":332,"licence":333,"licencié":334,"license":335,"license ":336,"license 
agreement":337,"license and":338,"license is":339,"license shall":340,"license to":341,"license you":342,"licensed":343,"licensed product":344,"licensed program":345,"licensed software":346,"licensed work":347,"licensee":348,"licenses":349,"licensor":350,"licensor and":351,"licensor or":352,"limitation":353,"logiciel":354,"loss":355,"made":356,"made available":357,"made by":358,"maintainer":359,"maintainer of":360,"make":361,"making":362,"material":363,"matter":364,"matter of":365,"may":366,"may be":367,"may convey":368,"mean":369,"means":370,"means any":371,"means the":372,"modification":373,"modifications":374,"modifications made":375,"modified":376,"modified covered":377,"modified version":378,"modify":379,"module":380,"more":381,"mulan":382,"mulan psl":383,"must":384,"must be":385,"nethack":386,"network":387,"new":388,"no":389,"nokia":390,"non-profit":391,"not":392,"notice":393,"notices":394,"object":395,"object code":396,"obligations":397,"oclc":398,"of":399,"of a":400,"of any":401,"of covered":402,"of exhibit":403,"of licensed":404,"of nethack":405,"of such":406,"of that":407,"of the":408,"of this":409,"offer":410,"on":411,"on a":412,"on the":413,"one":414,"only":415,"open":416,"open source":417,"or":418,"or a":419,"or all":420,"or any":421,"or communicate":422,"or in":423,"or other":424,"or otherwise":425,"ordinary":426,"original":427,"original code":428,"original developer":429,"original software":430,"original work":431,"osl":432,"osl 30":433,"other":434,"otherwise":435,"otherwise using":436,"ou":437,"out":438,"over":439,"over the":440,"package":441,"par":442,"paragraphs":443,"part":444,"part 1":445,"part 6":446,"part of":447,"participant":448,"particular":449,"parties":450,"party":451,"party\"":452,"patent":453,"patent license":454,"permission":455,"permissions":456,"permitted":457,"person":458,"php":459,"portion":460,"portion thereof)":461,"portions":462,"product":463,"product or":464,"products":465,"program":466,"program is":467,"program 
or":468,"programs":469,"propagate":470,"property":471,"property rights":472,"provide":473,"provided":474,"provided that":475,"provisions":476,"présente":477,"présente licence":478,"psl":479,"public":480,"public license":481,"publicly":482,"python":483,"python 16b1":484,"que":485,"qui":486,"québec":487,"realnetworks":488,"receive":489,"received":490,"recipient":491,"recipient may":492,"recipient's":493,"recipients":494,"redistributions":495,"reproduction":496,"required":497,"requirements":498,"respect":499,"respect to":500,"resulting":501,"right":502,"rights":503,"rights over":504,"rpl":505,"rsv":506,"run":507,"said":508,"secondary":509,"section":510,"sections":511,"server":512,"set":513,"set forth":514,"shall":515,"shall be":516,"shall mean":517,"shall not":518,"so":519,"software":520,"software and":521,"software foundation":522,"software is":523,"software or":524,"source":525,"source ":526,"source and":527,"source code":528,"source license":529,"source or":530,"standard":531,"standard version":532,"states":533,"subject":534,"subject matter":535,"subject software":536,"subject to":537,"subsequent":538,"subsequent contributor":539,"subsequent work":540,"such":541,"such contributor":542,"such recipient":543,"such subsequent":544,"supplement":545,"supplement file":546,"sybase":547,"sybase and":548,"sybase or":549,"system":550,"systems":551,"termes":552,"termination":553,"terms":554,"terms and":555,"terms of":556,"text":557,"than":558,"that":559,"that is":560,"that it":561,"that subsequent":562,"that the":563,"that uses":564,"that you":565,"the":566,"the ":567,"the agreement":568,"the case":569,"the combined":570,"the conditions":571,"the contributor":572,"the copyright":573,"the corresponding":574,"the covered":575,"the current":576,"the derived":577,"the european":578,"the following":579,"the font":580,"the frameworx":581,"the free":582,"the gnu":583,"the governed":584,"the initial":585,"the library":586,"the library\"":587,"the licence":588,"the license":589,"the 
licensed":590,"the licensee":591,"the licensor":592,"the modified":593,"the notice":594,"the object":595,"the ordinary":596,"the original":597,"the package":598,"the php":599,"the program":600,"the provisions":601,"the recipient":602,"the rpl":603,"the software":604,"the source":605,"the standard":606,"the subject":607,"the subsequent":608,"the supplement":609,"the terms":610,"the work":611,"them":612,"then":613,"thereof":614,"thereof)":615,"these":616,"they":617,"third":618,"third party":619,"this":620,"this agreement":621,"this licence":622,"this license":623,"this package":624,"those":625,"time":626,"to":627,"to ":628,"to a":629,"to any":630,"to copy":631,"to the":632,"to this":633,"to use":634,"toute":635,"trademarks":636,"un":637,"under":638,"under a":639,"under the":640,"under this":641,"une":642,"union":643,"united":644,"united states":645,"upon":646,"use":647,"use of":648,"used":649,"user":650,"users":651,"uses":652,"uses the":653,"using":654,"v":655,"version":656,"version of":657,"versions":658,"versions of":659,"warranties":660,"warranty":661,"we":662,"web":663,"where":664,"whether":665,"which":666,"which you":667,"who":668,"whole":669,"will":670,"with":671,"with a":672,"with respect":673,"with the":674,"within":675,"without":676,"without limitation":677,"work":678,"work (or":679,"work and":680,"work based":681,"work in":682,"work is":683,"work or":684,"works":685,"would":686,"you":687,"you convey":688,"you distribute":689,"you have":690,"you may":691,"you must":692,"your":693,"your extensions":694,"your work":695,"zope":696,"zope corporation":697,"à":698,"être":699}} 2 | --------------------------------------------------------------------------------