├── .dockerignore ├── docs ├── changelog_link.md ├── favicon.ico ├── cli.rst ├── intro │ ├── linked_data.rst │ ├── git.rst │ ├── quickstart.rst │ ├── usage_python.rst │ └── tokens.rst ├── Makefile ├── make.bat ├── index.rst ├── conf.py ├── logo_notext.svg └── logo.svg ├── gimie ├── parsers │ ├── license │ │ ├── data │ │ │ ├── tfidf_matrix.npz │ │ │ ├── spdx_licenses.csv │ │ │ └── tfidf_vectorizer.json │ │ └── __init__.py │ ├── abstract.py │ ├── __init__.py │ └── cff.py ├── graph │ ├── __init__.py │ ├── namespaces.py │ └── operations.py ├── __init__.py ├── extractors │ ├── abstract.py │ ├── common │ │ └── queries.py │ ├── __init__.py │ ├── git.py │ ├── gitlab.py │ └── github.py ├── project.py ├── io.py ├── utils │ ├── uri.py │ └── text_processing.py ├── cli.py └── models.py ├── tests ├── conftest.py ├── test_project.py ├── test_cli.py ├── test_github.py ├── test_parsers.py ├── test_gitlab.py ├── test_output.py ├── test_tfidf.py ├── test_git.py └── test_cff.py ├── NOTICE ├── .env.dist ├── .docker ├── entrypoint.sh └── Dockerfile ├── .pre-commit-config.yaml ├── .github └── workflows │ ├── conventional-prs.yml │ ├── sphinx-docs.yml │ ├── poetry-pytest.yml │ ├── poetry-publish.yml │ ├── poetry-test-publish.yml │ └── docker-publish.yml ├── Makefile ├── CITATION.cff ├── scripts └── generate_tfidf.py ├── .gitignore ├── pyproject.toml ├── CHANGELOG.md ├── README.md └── LICENSE /.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | *__pycache__* 3 | -------------------------------------------------------------------------------- /docs/changelog_link.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` 3 | -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdsc-ordes/gimie/HEAD/docs/favicon.ico -------------------------------------------------------------------------------- /gimie/parsers/license/data/tfidf_matrix.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdsc-ordes/gimie/HEAD/gimie/parsers/license/data/tfidf_matrix.npz -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ********************** 3 | 4 | .. click:: gimie.cli:cli 5 | :prog: gimie 6 | :nested: full 7 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Avoid _pytest.pathlib.ImportPathMismatchError for pytest""" 2 | import os 3 | 4 | os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" 5 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Gimie 2 | Copyright 2022 - Swiss Data Science Center (SDSC) 3 | A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | Eidgenössische Technische Hochschule Zürich (ETHZ). 
5 | -------------------------------------------------------------------------------- /.env.dist: -------------------------------------------------------------------------------- 1 | # create your personal github token and add it here 2 | # see [here](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) 3 | GITHUB_TOKEN= 4 | GITLAB_TOKEN= 5 | -------------------------------------------------------------------------------- /.docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # If the first argument starts with "-" or if it is not recognized as a command, 6 | # use "gimie" as command 7 | if [ -z "${1##-*}" ] || [ -z "$(command -v $1)" ] ; then 8 | set -- gimie "$@" 9 | fi 10 | 11 | exec "$@" 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.10.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /tests/test_project.py: -------------------------------------------------------------------------------- 1 | """Test the project module.""" 2 | import pytest 3 | 4 | from gimie.extractors import GIT_PROVIDERS 5 | from gimie.project import get_extractor 6 | 7 | 8 | def test_get_extractor(): 9 | repo = "https://example.org/group/project" 10 | for prov, extractor in GIT_PROVIDERS.items(): 11 | assert type(get_extractor(repo, prov)) == extractor 12 | 13 | with pytest.raises(ValueError): 14 | get_extractor(repo, "bad_provider") 15 | 
-------------------------------------------------------------------------------- /docs/intro/linked_data.rst: -------------------------------------------------------------------------------- 1 | Linked data 2 | *********** 3 | 4 | The aim of gimie is to extract project metadata in an interoperable format. This is achieved by generating `linked data `_ following the widely used `schema.org `_ ontology. The resulting metadata can readily be augmented or integrated with other data sources. 5 | 6 | Gimie's output follows recommendations provided by the `codemeta project `_ , but also provides additional properties. 7 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yml: -------------------------------------------------------------------------------- 1 | name: Conventional PR title 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - reopened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | title-format: 12 | runs-on: ubuntu-latest 13 | steps: 14 | # https://github.com/amannn/action-semantic-pull-request 15 | - name: PR title format check 16 | uses: amannn/action-semantic-pull-request@v5.3.0 17 | continue-on-error: true 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | with: 21 | validateSingleCommit: true 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for the Gimie command line interface.""" 2 | 3 | from gimie import cli 4 | from typer.testing import CliRunner 5 | 6 | runner = CliRunner() 7 | 8 | 9 | def test_data_help(): 10 | """Checks if the 'gimie data --help' command exits successfully.""" 11 | result = runner.invoke(cli.app, ["data", "--help"]) 12 | assert result.exit_code == 0 13 | 14 | 15 | def test_advice_help(): 16 | """Checks if the 'gimie advice --help' command exits successfully.""" 17 | result = runner.invoke(cli.app, ["advice", "--help"]) 18 | assert result.exit_code == 0 19 | 20 | 21 | def test_parsers_help(): 22 | """Checks if the 'gimie parsers --help' command exits successfully.""" 23 | result = runner.invoke(cli.app, ["parsers", "--help"]) 24 | assert result.exit_code == 0 25 | -------------------------------------------------------------------------------- /docs/intro/git.rst: -------------------------------------------------------------------------------- 1 | Git repositories 2 | **************** 3 | 4 | Software projects are usually version-controlled and hosted on a server. Git is by far the most popular version control system, and is commonly used for scientific software and data science projects. 5 | 6 | Git natively stores some metadata about the project authors and contributions in a local index, but git providers (servers) such has Github and GitLab store and expose more advanced information about the project and contributors. 
These information are served in provider-dependent format with specific APIs. 7 | 8 | Gimie aims to provide provider-agnostic metadata in an interoperable format. It will request data from the provider API if available, or from git by cloning the repository into a temporary folder otherwise. This metadata is then converted to the widely used schema.org standard so that it can readily be integrated with other tools and services. 9 | -------------------------------------------------------------------------------- /gimie/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from typing import Tuple, Union 18 | 19 | from rdflib.term import Literal, URIRef 20 | 21 | Property = Tuple[URIRef, Union[URIRef, Literal]] 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /tests/test_github.py: -------------------------------------------------------------------------------- 1 | # Tests fetching metadata from GitHub repositories with different setups. 
2 | import pytest 3 | 4 | from gimie.extractors.github import GithubExtractor 5 | from gimie.io import RemoteResource 6 | 7 | 8 | TEST_REPOS = [ 9 | "https://github.com/sdsc-ordes/gimie", # Owned by organization, has releases 10 | "https://github.com/apache/openoffice", # Owned by organization, no releases 11 | "https://github.com/ishepard/pydriller", # Owned by user, has releases 12 | "https://github.com/rmfranken/license_test", # Contains 2 license files 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("repo", TEST_REPOS) 17 | def test_github_extract(repo): 18 | meta = GithubExtractor(repo).extract() 19 | meta.serialize(format="ttl") 20 | 21 | 22 | @pytest.mark.parametrize("repo", TEST_REPOS) 23 | def test_github_list_files(repo): 24 | files = GithubExtractor(repo).list_files() 25 | assert all(isinstance(f, RemoteResource) for f in files) 26 | -------------------------------------------------------------------------------- /tests/test_parsers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gimie.io import LocalResource 4 | from gimie.parsers import get_parser, list_parsers, parse_files 5 | from rdflib import URIRef 6 | from rdflib import Graph, URIRef, Literal 7 | 8 | 9 | def test_get_parser(): 10 | # All parsers are available 11 | for name in list_parsers(): 12 | get_parser(name) 13 | 14 | 15 | def test_get_bad_parser(): 16 | # Should raise error if parser not found 17 | with pytest.raises(ValueError): 18 | get_parser("bad_parser") 19 | 20 | 21 | def test_parse_license(): 22 | license_file = LocalResource("LICENSE") 23 | graph = parse_files( 24 | subject=URIRef("https://exmaple.org/"), files=[license_file] 25 | ) 26 | assert "https://spdx.org" in graph.serialize(format="ttl") 27 | 28 | 29 | def test_parse_nothing(): 30 | folder = LocalResource("tests") 31 | graph = parse_files(subject=URIRef("https://example.org/"), files=[folder]) 32 | assert len(graph) == 0 33 | 
-------------------------------------------------------------------------------- /tests/test_gitlab.py: -------------------------------------------------------------------------------- 1 | from gimie.io import RemoteResource 2 | from gimie.extractors.gitlab import GitlabExtractor 3 | import pytest 4 | 5 | TEST_REPOS = [ 6 | "https://gitlab.com/inkscape/inkscape", # Owned by multiple persons, has releases 7 | "https://gitlab.com/openrgb-pvazny/OpenRGB", # No user owner so group owner, no releases 8 | "https://gitlab.com/gitlab-org/gitlab-runner", # No user owner so group owner, has releases 9 | "https://gitlab.com/commonground/haven/haven", # Nested groups 10 | "https://gitlab.com/edouardklein/falsisign", # owned by user 11 | "https://gitlab.com/rmfranken/test-licenses", # Contains 2 license files 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("repo", TEST_REPOS) 16 | def test_gitlab_extract(repo): 17 | extractor = GitlabExtractor(repo) 18 | meta = extractor.extract() 19 | meta.serialize(format="ttl") 20 | 21 | 22 | @pytest.mark.parametrize("repo", TEST_REPOS) 23 | def test_gitlab_list_files(repo): 24 | files = GitlabExtractor(repo).list_files() 25 | assert all(isinstance(f, RemoteResource) for f in files) 26 | -------------------------------------------------------------------------------- /gimie/graph/namespaces.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from rdflib.namespace import Namespace 18 | 19 | SDO = Namespace("http://schema.org/") 20 | COD = Namespace("https://doi.org/10.5063/schema/codemeta-2.0/") 21 | SD = Namespace("https://w3id.org/okn/o/sd/1.9.0/") 22 | BIO = Namespace("https://bioschemas.org/") 23 | GIMIE = Namespace("https://sdsc-ordes.github.io/gimie/") 24 | MD4I = Namespace("http://w3id.org/nfdi4ing/metadata4ing#") 25 | -------------------------------------------------------------------------------- /gimie/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import logging 19 | 20 | import importlib.metadata as importlib_metadata 21 | 22 | __version__ = importlib_metadata.version(__name__) 23 | 24 | logger = logging.getLogger() 25 | stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s") 26 | stream_handler = logging.StreamHandler() 27 | stream_handler.setLevel(logging.WARNING) 28 | stream_handler.setFormatter(stdout_formatter) 29 | logger.addHandler(stream_handler) 30 | -------------------------------------------------------------------------------- /tests/test_output.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """Test the gimie output""" 18 | import pytest 19 | from rdflib import Graph 20 | 21 | from gimie.project import Project 22 | 23 | 24 | OUT_TTL = ( 25 | Project("https://github.com/sdsc-ordes/gimie", git_provider="github") 26 | .extract() 27 | .serialize(format="ttl") 28 | ) 29 | 30 | 31 | def test_validate_output_is_linked_data(): 32 | """Is output valid RDF?""" 33 | g = Graph().parse(format="ttl", data=OUT_TTL) 34 | assert g is not None 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gimie documentation master file, created by 2 | sphinx-quickstart on Tue Jun 6 16:50:55 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: logo.svg 7 | :width: 200 8 | :alt: gimie logo 9 | 10 | 11 | Welcome to gimie's documentation! 12 | ================================= 13 | gimie (Git Meta Information Extractor) is a python library and command line tool to extract structured metadata from git repositories. 14 | 15 | .. card:: :octicon:`mark-github;2em` `GitHub repository `_ 16 | 17 | Visit gimie's GitHub repository to follow the latest developments! 18 | 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Background 23 | 24 | Linked data - What is it and why do we use it? 25 | Git repositories - Where code lives 26 | Access tokens - Authenticate gimie on your behalf 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: Documentation 31 | 32 | intro/quickstart 33 | intro/usage_python 34 | API Documentation 35 | CLI Documentation 36 | 37 | .. 
toctree:: changelog_link 38 | :maxdepth: 1 39 | :caption: Changelog 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: install 2 | install: ## Install with the poetry and add pre-commit hooks 3 | @echo "🚀 Installing packages with poetry" 4 | @poetry install 5 | @poetry run pre-commit install 6 | 7 | .PHONY: check 8 | check: ## Run code quality tools. 9 | @echo "🚀 Checking Poetry lock file consistency with 'pyproject.toml': Running poetry lock --check" 10 | @poetry lock --check 11 | @echo "🚀 Linting code: Running pre-commit" 12 | @poetry run pre-commit run -a 13 | 14 | .PHONY: doc 15 | doc: ## Build sphinx documentation website locally 16 | @echo "📖 Building documentation" 17 | @cd docs 18 | @poetry run sphinx-apidoc -d 3 -f -o docs/api gimie 19 | @poetry run sphinx-build docs/ docs/_build 20 | 21 | .PHONY: docker-build 22 | docker-build: ## Build the gimie Docker image 23 | @echo "🐋 Building docker image" 24 | @docker build -t gimie -f .docker/Dockerfile . 25 | 26 | .PHONY: test 27 | test: ## Test the code with pytest 28 | @echo "🚀 Testing code: Running pytest" 29 | @poetry run pytest 30 | 31 | .PHONY: changelog 32 | changelog: ## Generate the changelog 33 | @git-cliff -l -c pyproject.toml || echo "git-cliff must be installed" 34 | 35 | .PHONY: help 36 | help: 37 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 38 | 39 | .DEFAULT_GOAL := help 40 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 
3 | 4 | cff-version: 1.2.0 5 | title: gimie 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Cyril 12 | family-names: Matthey-Doret 13 | affiliation: Swiss Data Science Center 14 | orcid: 'https://orcid.org/0000-0002-1126-1535' 15 | - given-names: Sabine 16 | family-names: Maennel 17 | orcid: 'https://orcid.org/0009-0001-3022-8239' 18 | affiliation: Swiss Data Science Center 19 | - given-names: Robin 20 | family-names: Franken 21 | orcid: 'https://orcid.org/0009-0008-0143-9118' 22 | affiliation: Swiss Data Science Center 23 | - given-names: Martin 24 | family-names: Fontanet 25 | orcid: 'https://orcid.org/0000-0002-6441-8540' 26 | affiliation: Swiss Data Science Center 27 | - given-names: Laure 28 | family-names: Vancauwenberghe 29 | affiliation: Swiss Data Science Center 30 | - given-names: Stefan 31 | family-names: Milosavljevic 32 | email: supermegaiperste@hotmail.com 33 | affiliation: Swiss Data Science Center 34 | repository-code: 'https://github.com/sdsc-ordes/gimie' 35 | abstract: Extract linked metadata from repositories 36 | keywords: 37 | - git 38 | - cli 39 | - library 40 | - linked-open-data 41 | - metadata-extraction 42 | - fair-data 43 | - scientific-software 44 | license: Apache-2.0 45 | -------------------------------------------------------------------------------- /gimie/graph/operations.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Operations on graphs.""" 18 | from functools import reduce 19 | from typing import Set 20 | 21 | from rdflib import Graph 22 | from rdflib.term import URIRef 23 | 24 | from gimie.graph import Property 25 | 26 | 27 | def combine_graphs(*graphs: Graph) -> Graph: 28 | """Combines an arbitrary number of input graphs 29 | into a single graph.""" 30 | return reduce(lambda g1, g2: g1 | g2, graphs) 31 | 32 | 33 | def properties_to_graph(uri: URIRef, properties: Set[Property]) -> Graph: 34 | """Attaches a set of predicate-object tuples to input 35 | URI to produce an RDF graph.""" 36 | g = Graph() 37 | for pred, obj in properties: 38 | g.add((uri, pred, obj)) 39 | return g 40 | -------------------------------------------------------------------------------- /docs/intro/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick start 2 | *********** 3 | 4 | The easiest way to use gimie is to run it as a command line tool. Here's how to get started: 5 | 6 | Install using pip or docker: 7 | 8 | .. tab-set:: 9 | 10 | .. tab-item:: pip 11 | :sync: pip 12 | :selected: 13 | 14 | .. code-block:: console 15 | 16 | pip install gimie 17 | 18 | .. tab-item:: docker 19 | :sync: docker 20 | 21 | .. code-block:: console 22 | 23 | docker pull ghcr.io/sdsc-ordes/gimie:latest 24 | 25 | 26 | .. warning:: 27 | 28 | Before running gimie, you will need to obtain a personal access token for the GitHub and/or GitLab and export it as an environment variable. 
See :ref:`Token management` for more information. 29 | 30 | 31 | Gimie can then be used as follows to extract repository metadata: 32 | 33 | .. tab-set:: 34 | 35 | .. tab-item:: pip 36 | :sync: pip 37 | :selected: 38 | 39 | .. code-block:: console 40 | :emphasize-text: 41 | 42 | gimie data > output.ttl 43 | 44 | .. tab-item:: docker 45 | :sync: docker 46 | 47 | .. code-block:: console 48 | :emphasize-text: 49 | 50 | docker run -e GITHUB_TOKEN=${GITHUB_TOKEN} ghcr.io/sdsc-ordes/gimie:latest data > output.ttl 51 | 52 | 53 | .. note:: 54 | 55 | When running gimie in a container, you need to pass your github or gitlab token as an environment variable inside the container: 56 | -------------------------------------------------------------------------------- /.docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG VERSION_BUILD 2 | 3 | FROM python:3.12-slim-bookworm AS python 4 | ENV PYTHONUNBUFFERED=true 5 | WORKDIR /app 6 | 7 | LABEL org.opencontainers.image.source=https://github.com/sdsc-ordes/gimie 8 | LABEL org.opencontainers.image.description="Extract linked metadata from repositories." 
9 | LABEL org.opencontainers.image.licenses=Apache-2.0 10 | LABEL org.opencontainers.image.version=${VERSION_BUILD} 11 | 12 | ################################################## 13 | # Poetry setup 14 | ################################################## 15 | FROM python AS poetry 16 | 17 | # Install poetry 18 | ENV POETRY_HOME=/opt/poetry 19 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true 20 | ENV PATH="$POETRY_HOME/bin:$PATH" 21 | RUN python -c 'from urllib.request import urlopen; print(urlopen("https://install.python-poetry.org").read().decode())' | python - 22 | 23 | # Copy necessary files only 24 | COPY gimie ./gimie 25 | COPY pyproject.toml ./pyproject.toml 26 | COPY poetry.lock ./poetry.lock 27 | COPY .env.dist ./.env.dist 28 | COPY README.md ./README.md 29 | RUN apt-get update && \ 30 | apt-get install -y gcc 31 | 32 | # Poetry install 33 | RUN poetry install --no-interaction --no-ansi -vvv 34 | 35 | 36 | ################################################## 37 | # Gimie setup 38 | ################################################## 39 | FROM python AS runtime 40 | ENV PATH="/app/.venv/bin:$PATH" 41 | RUN apt-get update && \ 42 | apt-get install -y git libgomp1 libmagic-dev 43 | COPY --from=poetry /app /app 44 | COPY ".docker/entrypoint.sh" "/entrypoint.sh" 45 | 46 | # Set user 47 | RUN useradd -ms /bin/bash gimie_user 48 | USER gimie_user 49 | 50 | # Test gimie 51 | RUN gimie --version 52 | 53 | # Set command and entrypoint 54 | CMD ["gimie"] 55 | ENTRYPOINT ["/entrypoint.sh"] 56 | -------------------------------------------------------------------------------- /.github/workflows/sphinx-docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | paths: 7 | - 'docs/**' 8 | 9 | permissions: 10 | contents: write 11 | jobs: 12 | docs-build: 13 | runs-on: ubuntu-latest 14 | if: github.ref != 'refs/heads/main' 15 | steps: 16 | # https://github.com/actions/checkout 17 | 
- uses: actions/checkout@v4 18 | 19 | # https://github.com/actions/setup-python 20 | - uses: actions/setup-python@v4 21 | 22 | # https://github.com/snok/install-poetry 23 | - name: Install Poetry 24 | uses: snok/install-poetry@v1 25 | 26 | - name: Install dependencies 27 | run: | 28 | poetry install --with doc 29 | 30 | - name: Sphinx build 31 | run: | 32 | make doc 33 | 34 | docs-push: 35 | runs-on: ubuntu-latest 36 | if: github.ref == 'refs/heads/main' 37 | steps: 38 | # https://github.com/actions/checkout 39 | - uses: actions/checkout@v4 40 | 41 | # https://github.com/actions/setup-python 42 | - uses: actions/setup-python@v4 43 | 44 | # https://github.com/snok/install-poetry 45 | - name: Install Poetry 46 | uses: snok/install-poetry@v1 47 | 48 | - name: Install dependencies 49 | run: | 50 | poetry install --with doc 51 | 52 | - name: Sphinx build 53 | run: | 54 | make doc 55 | 56 | # https://github.com/peaceiris/actions-gh-pages 57 | - name: Deploy 58 | uses: peaceiris/actions-gh-pages@v3 59 | # if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/docs-website' }} 60 | with: 61 | publish_branch: gh-pages 62 | github_token: ${{ secrets.GITHUB_TOKEN }} 63 | publish_dir: docs/_build/ 64 | force_orphan: true 65 | -------------------------------------------------------------------------------- /.github/workflows/poetry-pytest.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push, workflow_call] 4 | 5 | jobs: 6 | 7 | test: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12"] 12 | steps: 13 | # https://github.com/actions/checkout 14 | - uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | 18 | # https://github.com/actions/setup-python 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | # 
https://github.com/snok/install-poetry 25 | - name: Install Poetry 26 | uses: snok/install-poetry@v1 27 | 28 | - name: Install Dependencies 29 | run: poetry install 30 | if: steps.cache.outputs.cache-hit != 'true' 31 | 32 | - name: Code Quality 33 | run: poetry run black . --check 34 | 35 | - name: Test with pytest 36 | env: 37 | GITHUB_TOKEN: ${{ secrets.ACCESS_GITHUB_TOKEN }} 38 | GITLAB_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }} 39 | run: make test 40 | 41 | - name: Upload coverage report 42 | env: 43 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} 44 | GITHUB_TOKEN: ${{ secrets.ACCESS_GITHUB_TOKEN }} 45 | GITLAB_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }} 46 | COVERALLS_PARALLEL: true 47 | run: | 48 | pip install coveralls 49 | coveralls --service=github-actions 50 | continue-on-error: true 51 | 52 | finish: 53 | needs: test 54 | if: ${{ always() }} 55 | runs-on: ubuntu-latest 56 | steps: 57 | # https://github.com/coverallsapp/github-action 58 | - name: Coveralls Finished 59 | uses: coverallsapp/github-action@v2 60 | with: 61 | parallel-finished: true 62 | -------------------------------------------------------------------------------- /gimie/parsers/abstract.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class Parser(ABC):
    """
    Parser is an Abstract Base Class. It is only meant
    to define a standard interface for all parsers.

    All subclasses must implement parse(). A parser parses
    bytes data into an RDF graph of properties attached to
    a fixed subject.

    Parameters
    ----------
    subject:
        The subject of a triple (subject - predicate - object) to be used
        for writing parsed properties to.
    """

    def __init__(self, subject: str):
        self.subject = URIRef(subject)

    @abstractmethod
    def parse(self, data: bytes) -> Graph:
        """Extract rdf graph from a source."""
        ...

    def parse_all(self, docs: Iterable[bytes]) -> Graph:
        """Parse multiple sources and return the union of their triples.

        Returns
        -------
        Graph
            Union of the graphs produced by parsing each document.
            An empty graph is returned when ``docs`` is empty.
        """
        graphs = map(self.parse, docs)
        # Seed the fold with an empty Graph: reduce() without an initial
        # value raises TypeError on an empty iterable.
        return reduce(lambda g1, g2: g1 | g2, graphs, Graph())
#!/usr/bin/env python3
"""Download all SPDX licenses and fit a tf-idf vectorizer to them.
The tf-idf matrix, vectorizer and license list are then saved to disk."""

import json
from pathlib import Path
from typing import List, NamedTuple

import numpy as np
import scipy.sparse as sp
import requests

# The vectorizer lives in gimie.utils.text_processing (see project tree and
# tests/test_tfidf.py); "gimie.utils.text" is not an existing module.
from gimie.utils.text_processing import TfidfConfig, TfidfVectorizer

# Output directory holding the pre-fitted license classification data.
OUT_DIR = Path("gimie") / "parsers" / "license" / "data"

# Retrieve metadata for all OSI approved and valid licenses from SPDX
SPDX_LIST_URL = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
all_licenses = requests.get(SPDX_LIST_URL).json()["licenses"]
licenses = [
    lic
    for lic in all_licenses
    if lic["isOsiApproved"] and not lic["isDeprecatedLicenseId"]
]


# Assemble corpus of license texts (this takes a while)
class License(NamedTuple):
    # SPDX identifier, e.g. "MIT"
    license_id: str
    # Full license text
    text: str


corpus: List[License] = []

for lic in licenses:
    resp = requests.get(lic["detailsUrl"])
    if not resp.ok:
        # Skip licenses whose full text cannot be fetched.
        continue
    corpus.append(License(lic["licenseId"], resp.json()["licenseText"]))

# Fit tfidf vectorizer to corpus
texts = [entry.text for entry in corpus]
vectorizer = TfidfVectorizer(
    config=TfidfConfig(
        max_features=700, ngram_range=(1, 2), sublinear_tf=True, norm="l2"
    )
)
tfidf = vectorizer.fit_transform(texts)

# Save vectorizer and tfidf matrix
with open(OUT_DIR / "tfidf_vectorizer.json", "w") as fp:
    fp.write(vectorizer.model_dump_json())
# Prune precision to reduce size; float16 is enough for similarity scoring.
tfidf.data = tfidf.data.astype(np.float16)
sp.save_npz(OUT_DIR / "tfidf_matrix.npz", tfidf)
# License index: one "<spdx_id>,<text_length>" row per license.
with open(OUT_DIR / "spdx_licenses.csv", "w") as fp:
    for entry in corpus:
        fp.write(f"{entry.license_id},{len(entry.text)}\n")
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "gimie"
copyright = "2023, sdsc-ordes"
author = "sdsc-ordes"
# Documented release version; presumably kept in sync with pyproject.toml —
# TODO confirm (not visible from this file).
release = "0.7.2"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.autodoc",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx.ext.coverage",
    "sphinx.ext.viewcode",
    "sphinx.ext.githubpages",
    "sphinx.ext.autosectionlabel",
    "sphinx_click",
    "sphinx_copybutton",
    "sphinx_design",
    "myst_parser",
    "sphinxawesome_theme.highlighting",
]

templates_path = ["_templates"]

# Build both reStructuredText and Markdown sources (Markdown via myst_parser).
source_suffix = {
    ".rst": "restructuredtext",
    ".md": "markdown",
}


exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinxawesome_theme"
html_static_path = ["_static"]
html_logo = "logo_notext.svg"
html_favicon = "favicon.ico"


# -- Extension configuration -------------------------------------------------

# Options for intersphinx

# Cross-reference objects from external projects' documentation by name.
intersphinx_mapping = {
    "python": ("https://docs.python.org/", None),
    "rdflib": ("https://rdflib.readthedocs.io/en/stable/", None),
    "calamus": ("https://calamus.readthedocs.io/en/latest/", None),
}
# Jobs are split to prevent unnecessary privilege elevation through write permissions during building.
"""Tests for the TfidfVectorizer implementation in gimie.utils.text_processing."""
import json
from typing import List

import numpy as np
import pytest

from gimie.utils.text_processing import TfidfConfig, TfidfVectorizer

# Tiny two-document corpus shared by all tests below.
CORPUS = [
    "This is my test document.",
    "This is another test document.",
]


@pytest.fixture
def tfidf_vectorizer() -> TfidfVectorizer:
    """Fixture for a TfidfVectorizer instance."""
    config = TfidfConfig(norm="l2", sublinear_tf=True)
    return TfidfVectorizer(config=config)


def test_tfidf_serde(tfidf_vectorizer: TfidfVectorizer):
    """Test json serialization and deserialization of TfidfVectorizer."""
    json_str = tfidf_vectorizer.model_dump_json(indent=2)
    # The dump must be valid JSON and round-trip through model validation.
    json.loads(json_str)
    print(TfidfVectorizer.model_validate_json(json_str))


def test_tfidf_fit_transform(tfidf_vectorizer: TfidfVectorizer):
    """Test correctness of tfidf fit."""
    _ = tfidf_vectorizer.fit_transform(CORPUS)
    # targets computed using sklearn 1.2.2
    target_voc = {
        "another": 0,
        "document": 1,
        "is": 2,
        "my": 3,
        "test": 4,
        "this": 5,
    }
    target_idf = np.array(
        [1.4054651081081644, 1.0, 1.0, 1.4054651081081644, 1.0, 1.0]
    )
    # Fitted vocabulary must map each term to the same index as sklearn.
    assert all(
        [v == target_voc[t] for t, v in tfidf_vectorizer.vocabulary.items()]
    )
    # Learned idf weights are compared exactly; presumably the implementation
    # follows sklearn's idf formula bit-for-bit — confirm if this flakes.
    pred_idf: List[float] = tfidf_vectorizer.idf_vector
    assert all([pred == target for pred, target in zip(pred_idf, target_idf)])


# Test fitting different configurations
@pytest.mark.parametrize(
    "config",
    [
        TfidfConfig(),
        TfidfConfig(max_features=10),
        TfidfConfig(ngram_range=(1, 2)),
        TfidfConfig(ngram_range=(2, 2)),
        TfidfConfig(smooth_idf=False),
        TfidfConfig(norm="l1"),
        TfidfConfig(norm="l2"),
        TfidfConfig(sublinear_tf=True),
        TfidfConfig(vocabulary={"this": 0, "is": 1, "test": 2}),
    ],
)
def test_tfidf_configs(config: TfidfConfig):
    """Test fitting different configurations."""
    # Smoke test: every supported configuration must fit without raising.
    vectorizer = TfidfVectorizer(config=config)
    _ = vectorizer.fit_transform(CORPUS)
Tristan Fontanet", 33 | "rmfranken", 34 | "sabrinaossey", 35 | ] 36 | assert all([n in contribs for n in names]) 37 | assert author.name == "Cyril Matthey-Doret" 38 | 39 | 40 | def test_git_creation_date(local_meta): 41 | """Test the creation date of a git repository.""" 42 | assert local_meta.date_created.astimezone( 43 | datetime.timezone.utc 44 | ) == datetime.datetime( 45 | 2022, 12, 7, 10, 19, 31, tzinfo=datetime.timezone.utc 46 | ) 47 | 48 | 49 | def test_set_uri(): 50 | meta = GitExtractor( 51 | "https://example.com/test", local_path=LOCAL_REPOSITORY 52 | ).extract() 53 | assert meta._id == "https://example.com/test" 54 | 55 | 56 | def test_clone_extract_github(): 57 | """Clone Git repository by setting git extractor 58 | explicitely and extract metadata locally.""" 59 | proj = Project(RENKU_GITHUB, git_provider="git") 60 | assert type(proj.extractor) == GitExtractor 61 | proj.extract() 62 | 63 | 64 | def test_clone_unsupported(): 65 | """Instantiate Project from unsupported provider 66 | with git as default provider""" 67 | proj = Project(UNSUPPORTED_PROV) 68 | assert type(proj.extractor) == GitExtractor 69 | proj.extract() 70 | 71 | 72 | def test_git_list_files(): 73 | files = GitExtractor(UNSUPPORTED_PROV).list_files() 74 | assert all(isinstance(f, LocalResource) for f in files) 75 | -------------------------------------------------------------------------------- /gimie/extractors/abstract.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
class Extractor(ABC):
    """Common interface for all git repository metadata extractors.

    This abstract base class only defines the contract: concrete
    extractors for each git provider must implement extract() and
    list_files().
    """

    def __init__(
        self,
        url: str,
        base_url: Optional[str] = None,
        local_path: Optional[str] = None,
    ):
        self.url = url
        self.base_url = base_url
        self.local_path = local_path

    @abstractmethod
    def extract(self) -> Repository:
        """Extract metadata from the git provider into a Repository object."""
        ...

    @abstractmethod
    def list_files(self) -> List[Resource]:
        """List all files in the repository HEAD."""
        ...

    @property
    def path(self) -> str:
        """Path to the repository without the base URL."""
        if self.base_url is not None:
            # Drop the known base and surrounding slashes.
            return self.url.removeprefix(self.base_url).strip("/")
        # No explicit base: the path component of the URL is the repo path.
        return urlparse(self.url).path.strip("/")

    @property
    def base(self) -> str:
        """Base URL of the remote."""
        if self.base_url is not None:
            return self.base_url
        # Reconstruct scheme://host from the repository URL.
        parsed = urlparse(self.url)
        return f"{parsed.scheme}://{parsed.netloc}"
CffParser(subject=URIRef("https://example.org/")).parse( 66 | data=cff_file 67 | ) 68 | ) 69 | == 0 70 | ) 71 | 72 | 73 | def test_parse_doi(): 74 | cff_file = b""" 75 | cff-version: 1.2.0 76 | message: If you use this software, please cite it using these metadata. 77 | title: 'napari: a multi-dimensional image viewer for Python' 78 | identifiers: 79 | - type: doi 80 | value: 10.5281/zenodo.3555620 81 | - type: doi 82 | value: 10.21105/joss.01274 83 | """ 84 | parsed_dois = list( 85 | CffParser(subject=URIRef("https://example.org/")) 86 | .parse(data=cff_file) 87 | .objects() 88 | ) 89 | expected_dois = [ 90 | URIRef("https://doi.org/10.5281/zenodo.3555620"), 91 | URIRef("https://doi.org/10.21105/joss.01274"), 92 | ] 93 | # parsed_dois already contains all parsed DOI objects 94 | for doi in expected_dois: 95 | assert doi in parsed_dois 96 | -------------------------------------------------------------------------------- /gimie/extractors/common/queries.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
import requests
from typing import Any, Dict, List, Union

# Hint appended when a request fails due to missing/invalid credentials.
_RATE_LIMIT_HINT = (
    "Authentication failed: API rate limit exceeded. Please check that you have added "
    "your GITHUB_TOKEN or GITLAB_TOKEN to your environment variables."
)


def _check_response(resp: requests.Response) -> None:
    """Raise ConnectionError with a descriptive message for non-200 responses.

    Shared by the REST and GraphQL helpers so both report errors
    consistently.

    Raises
    ------
    ConnectionError
        When the response status code is not 200.
    """
    if resp.status_code == 200:
        return
    try:
        error_msg = resp.json().get("message", "")
    except ValueError:
        # Error body is not JSON (e.g. an HTML gateway error page); report
        # the raw text instead of crashing with a JSON decoding error.
        error_msg = resp.text
    if "API rate limit exceeded" in error_msg:
        raise ConnectionError(_RATE_LIMIT_HINT)
    raise ConnectionError(f"API request failed: {error_msg}")


def send_rest_query(
    api: str, query: str, headers: Dict[str, str]
) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Generic function to send a query to the GitHub/GitLab rest API.

    Parameters
    ----------
    api
        Base URL of the REST API.
    query
        Endpoint path appended to the base URL.
    headers
        HTTP headers, typically carrying the auth token.
    """
    resp = requests.get(
        url=f"{api}/{query}",
        headers=headers,
    )
    _check_response(resp)
    return resp.json()


def send_graphql_query(
    api: str, query: str, data: Dict[str, Any], headers: Dict[str, str]
) -> Dict[str, Any]:
    """Generic function to send a GraphQL query to the GitHub/GitLab API.

    Parameters
    ----------
    api
        Base URL of the API; "/graphql" is appended.
    query
        GraphQL query string.
    data
        GraphQL variables substituted into the query.
    headers
        HTTP headers, typically carrying the auth token.
    """
    resp = requests.post(
        url=f"{api}/graphql",
        json={
            "query": query,
            "variables": data,
        },
        headers=headers,
    )
    _check_response(resp)
    return resp.json()
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .DS_Store 108 | **/.DS_Store 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # PyCharm 136 | .idea/ 137 | 138 | # Vscode 139 | .vscode/ 140 | .devcontainer.json 141 | .devcontainer/ 142 | -------------------------------------------------------------------------------- /gimie/parsers/license/data/spdx_licenses.csv: -------------------------------------------------------------------------------- 1 | 0BSD,643 2 | AAL,2529 3 | AFL-1.1,4676 4 | AFL-1.2,4950 5 | AFL-2.0,8986 6 | AFL-2.1,8947 7 | AFL-3.0,10315 8 | AGPL-3.0-only,34020 9 | AGPL-3.0-or-later,34020 10 | Apache-1.1,2514 11 | Apache-2.0,10280 12 | APL-1.0,46065 13 | APSL-1.0,19644 14 | APSL-1.1,20151 15 | APSL-1.2,19796 16 | APSL-2.0,20281 17 | Artistic-1.0,4854 18 | Artistic-1.0-cl8,5184 19 | Artistic-1.0-Perl,6060 20 | Artistic-2.0,8764 21 | BSD-1-Clause,1086 22 | BSD-2-Clause,1267 23 | BSD-2-Clause-Patent,2569 24 | BSD-3-Clause,1460 25 | BSD-3-Clause-LBNL,2388 26 | BSL-1.0,1338 27 | CAL-1.0,16121 28 | CAL-1.0-Combined-Work-Exception,16121 29 | CATOSL-1.1,19046 30 | CDDL-1.0,16419 31 | CECILL-2.1,21774 32 | CERN-OHL-P-2.0,8855 33 | CERN-OHL-S-2.0,13419 34 | CERN-OHL-W-2.0,14529 
35 | CNRI-Python,3381 36 | CPAL-1.0,28141 37 | CPL-1.0,11653 38 | CUA-OPL-1.0,23381 39 | ECL-1.0,2425 40 | ECL-2.0,11111 41 | EFL-1.0,919 42 | EFL-2.0,924 43 | Entessa,2277 44 | EPL-1.0,11345 45 | EPL-2.0,13946 46 | EUDatagrid,3195 47 | EUPL-1.1,13231 48 | EUPL-1.2,13648 49 | Fair,245 50 | Frameworx-1.0,9771 51 | GPL-2.0-only,17337 52 | GPL-2.0-or-later,17337 53 | GPL-3.0-only,34509 54 | GPL-3.0-or-later,34509 55 | HPND,1187 56 | ICU,1597 57 | Intel,2078 58 | IPA,9093 59 | IPL-1.0,11409 60 | ISC,823 61 | Jam,195 62 | LGPL-2.0-only,24842 63 | LGPL-2.0-or-later,24842 64 | LGPL-2.1-only,25967 65 | LGPL-2.1-or-later,25967 66 | LGPL-3.0-only,41933 67 | LGPL-3.0-or-later,41933 68 | LiLiQ-P-1.1,6351 69 | LiLiQ-R-1.1,8392 70 | LiLiQ-Rplus-1.1,8043 71 | LPL-1.0,11948 72 | LPL-1.02,11824 73 | LPPL-1.3c,18575 74 | MirOS,888 75 | MIT,1078 76 | MIT-0,915 77 | MIT-Modern-Variant,917 78 | Motosoto,20187 79 | MPL-1.0,18272 80 | MPL-1.1,23669 81 | MPL-2.0,16727 82 | MPL-2.0-no-copyleft-exception,16727 83 | MS-PL,2663 84 | MS-RL,3058 85 | MulanPSL-2.0,6850 86 | Multics,2040 87 | NASA-1.3,13778 88 | Naumen,1953 89 | NCSA,1700 90 | NGPL,4703 91 | Nokia,21002 92 | NPOSL-3.0,11799 93 | NTP,714 94 | OCLC-2.0,11121 95 | OFL-1.1,4012 96 | OFL-1.1-no-RFN,4012 97 | OFL-1.1-RFN,4012 98 | OGTSL,5277 99 | OLDAP-2.8,2195 100 | OLFL-1.3,11401 101 | OSET-PL-2.1,19843 102 | OSL-1.0,8920 103 | OSL-2.0,9880 104 | OSL-2.1,9871 105 | OSL-3.0,10309 106 | PHP-3.0,2846 107 | PHP-3.01,2855 108 | PostgreSQL,1195 109 | Python-2.0,9411 110 | QPL-1.0,4364 111 | RPL-1.1,33931 112 | RPL-1.5,32009 113 | RPSL-1.0,30267 114 | RSCPL,21050 115 | SimPL-2.0,2529 116 | SISSL,14490 117 | Sleepycat,4995 118 | SPL-1.0,23398 119 | UCL-1.0,10556 120 | Unicode-DFS-2016,2857 121 | Unlicense,1211 122 | UPL-1.0,1833 123 | VSL-1.0,2065 124 | W3C,2701 125 | Watcom-1.0,20968 126 | Xnet,1250 127 | Zlib,838 128 | ZPL-2.0,2275 129 | ZPL-2.1,2100 130 | -------------------------------------------------------------------------------- 
/docs/intro/usage_python.rst: -------------------------------------------------------------------------------- 1 | Python Usage 2 | ************ 3 | 4 | Gimie can be used as a python library. Either to run the end-to-end extraction process on an input URL, or only a specific extractor. 5 | 6 | The end-to-end extraction is performed by ``gimie.Project`` and will automatically detect the git-provider and return directly an `rdflib.Graph` object. After extracting data from the git repository, parsers are executed on the files contents to enrich the graph with additional information.: 7 | 8 | .. code-block:: python 9 | 10 | from gimie.project import Project 11 | url = 'https://github.com/apache/pulsar' 12 | proj = Project(url) 13 | g = proj.extract() 14 | 15 | 16 | A specific extractor can also be used, for example to use with GitLab projects: 17 | 18 | .. code-block:: python 19 | 20 | from gimie.extractors import GitlabExtractor 21 | url = "https://gitlab.com/data-custodian/custodian" 22 | extractor = GitlabExtractor(url) 23 | repo = extractor.extract() 24 | 25 | 26 | Unlike `Project`, extractors only extract data from the git repository without running any parser, and return a `Repository` object. 27 | 28 | The `Repository` object can be serialized to RDF or converted to an rdflib graph: 29 | 30 | .. code-block:: python 31 | 32 | type(repo) 33 | # gimie.models.Repository 34 | repo.name 35 | # 'data-custodian/custodian' 36 | repo.prog_langs 37 | # ['Go', 'Dockerfile', 'Smarty', 'Shell', 'Makefile'] 38 | repo.serialize(format='json-ld', destination='custodian.json') 39 | g = repo.to_graph() 40 | type(g) 41 | # rdflib.graph.Graph 42 | 43 | Extractors also have a `list_files()` method which provides handles to a streamable file-like interface for files in the root of the repository. 44 | 45 | .. 
code-block:: python 46 | 47 | handles = extractor.list_files() 48 | readme_handle = handles[11] 49 | readme_handle.path 50 | # PosixPath('README.md') 51 | readme_handle.open().readlines()[:2] 52 | # [b'# The Swiss Data Custodian\n', b'\n'] 53 | 54 | 55 | Parsers can also be run manually on the files contents: 56 | 57 | 58 | .. code-block:: python 59 | 60 | from gimie.parsers import LicenseParser 61 | parser = LicenseParser() 62 | license_handle = handles[8] 63 | license_contents = license_handle.open().read() 64 | parser.parse(license_contents) 65 | # {(rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://spdx.org/licenses/AGPL-3.0-only.html'))} 66 | 67 | 68 | There is also a helper function to run parsers on a list of files, 69 | selecting the correct parser based on file names: 70 | 71 | .. code-block:: python 72 | 73 | from gimie.parsers import parse_files 74 | parse_files(handles) 75 | # {(rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://spdx.org/licenses/AGPL-3.0-only.html'))} 76 | -------------------------------------------------------------------------------- /gimie/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
"""Git providers from which metadata can be extracted by gimie."""
from typing import Dict, Optional, Type
from urllib.parse import urlparse

from gimie.extractors.abstract import Extractor
from gimie.extractors.github import GithubExtractor
from gimie.extractors.gitlab import GitlabExtractor
from gimie.extractors.git import GitExtractor
from gimie.utils.uri import validate_url

# Mapping from provider name to the extractor class handling it.
GIT_PROVIDERS: Dict[str, Type[Extractor]] = {
    "git": GitExtractor,
    "github": GithubExtractor,
    "gitlab": GitlabExtractor,
}


def get_extractor(
    url: str,
    source: str,
    base_url: Optional[str] = None,
    local_path: Optional[str] = None,
) -> Extractor:
    """Instantiate the correct extractor for a given source.

    Parameters
    -----------
    url
        Where the repository metadata is extracted from.
    source
        The source of the repository (git, gitlab, github, ...).
    base_url
        The base URL of the git remote.
    local_path
        If applicable, the path to the directory where the
        repository is located.

    Raises
    ------
    ValueError
        If ``source`` is not a known git provider.

    Examples
    --------
    >>> extractor = get_extractor(
    ...     "https://github.com/sdsc-ordes/gimie",
    ...     "github"
    ... )
    """
    try:
        return GIT_PROVIDERS[source](
            url, base_url=base_url, local_path=local_path
        )
    except KeyError as err:
        raise ValueError(
            f"Unknown git provider: {source}.\n"
            f"Supported sources: {', '.join(GIT_PROVIDERS)}"
        ) from err


def infer_git_provider(url: str) -> str:
    """Given a git repository URL, return the corresponding git provider.
    Local path or unsupported git providers will return "git".

    Examples
    --------
    >>> infer_git_provider("https://gitlab.com/foo/bar")
    'gitlab'
    >>> infer_git_provider("/foo/bar")
    'git'
    >>> infer_git_provider("https://codeberg.org/dnkl/foot")
    'git'
    """
    # Fall back to git if local path
    if not validate_url(url):
        return "git"

    # Match the provider name against the host only, so a repository path
    # containing e.g. "github" does not trigger a false positive. Self-hosted
    # instances like gitlab.example.com still match.
    host = urlparse(url).netloc.lower()
    for name in GIT_PROVIDERS:
        if name != "git" and name in host:
            return name

    # Fall back to git for unsupported providers
    return "git"
code-block:: console
         :emphasize-text: <token>

         export GITLAB_TOKEN=<token>
         export GITHUB_TOKEN=<token>

   .. tab-item:: Windows

      .. code-block:: console
         :emphasize-text: <token>

         # You may need to restart windows after this
         setx GITLAB_TOKEN <token>
         setx GITHUB_TOKEN <token>


2. Use a ``.env`` file in the current directory. Gimie will look for a file named ``.env`` and source it. The file contents should be as follows:

.. code-block::
   :emphasize-text: <token>
   :caption: File: .env

   GITLAB_TOKEN=<token>
   GITHUB_TOKEN=<token>


While the latter approach can be convenient to persist your token locally, it is generally not recommended to store your tokens in plain text as they are sensitive information. Hence the first approach should be preferred in most cases.

Encrypting tokens
=================

If you are serious about security, you should use a tool like `sops <https://github.com/getsops/sops>`_ or `pass <https://www.passwordstore.org/>`_ to encrypt your secrets.

Below is a quick guide on how to use ``sops`` to store encrypted tokens, and decrypt them on the fly when using gimie.

.. dropdown:: Generating PGP key

   PGP is a public key encryption system. If you don't already have one, you will need to generate a key pair to encrypt your secrets.
   You can use the following command to generate a key pair. You will be prompted for a passphrase, but you may leave it empty if you wish.

   .. code-block:: bash

      gpg --gen-key

.. dropdown:: Set up SOPS

   SOPS needs to be configured to use your PGP key. You can do so by running the following command.
   Replace ``${FINGERPRINT}`` with the fingerprint of your PGP key (it looks like ``69AB B75E ...``); you can find it by running ``gpg --fingerprint``.
   Upon running the command below, ``sops`` will open a ``vim`` buffer where you can enter the desired content of your .env file.
65 | Upon saving the file (``:wq``), ``sops`` will encrypt the file and save it as ``.enc.env``. 66 | 67 | .. code-block:: bash 68 | 69 | sops --pgp "${FINGERPRINT}" .enc.env 70 | 71 | .. dropdown:: Source tokens 72 | 73 | Whenever you want to run gimie, you can decrypt secrets on the fly and pass them to gimie using the following command: 74 | 75 | .. code-block:: bash 76 | :emphasize-text: 77 | 78 | sops exec-env .enc.env 'gimie data ' 79 | 80 | Or if you just want to inspect the decrypted file: 81 | 82 | .. code-block:: bash 83 | 84 | sops --decrypt .enc.env 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Package description 2 | 3 | [tool.poetry] 4 | name = "gimie" 5 | version = "0.7.2" 6 | description = "Extract structured metadata from git repositories." 7 | authors = ["Swiss Data Science Center "] 8 | license = "Apache-2.0" 9 | homepage = "https://github.com/sdsc-ordes/gimie" 10 | keywords = ["metadata", "git", "extraction", "linked-data"] 11 | readme = "README.md" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "Programming Language :: Python :: 3.12", 18 | "Intended Audience :: Science/Research", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: Apache Software License", 21 | "Operating System :: OS Independent", 22 | ] 23 | 24 | # Dependency management 25 | 26 | [tool.poetry.dependencies] 27 | python = ">=3.9,<4.0" 28 | gitpython = ">=3.1.35" 29 | PyDriller = "^2.5" 30 | typer = "^0.7.0" 31 | calamus = "^0.4.2" 32 | requests = "^2.28.2" 33 | python-dotenv = "^0.21.1" 34 | python-dateutil = "^2.8.2" 35 | spdx-license-list = "^3.22" 36 | numpy = "^1.26.1" 37 | pydantic = "^2.4.2" 38 | scipy = "^1.11.3" 39 | pyyaml = "^6.0.2" 40 | 41 | 
[tool.poetry.group.dev.dependencies] 42 | black = "^22.10.0" 43 | coveralls = "^3.3.1" 44 | pre-commit = "^3.0.0" 45 | pytest = "^7.2.0" 46 | pytest-cov = "^4.1.0" 47 | 48 | 49 | [tool.poetry.group.doc.dependencies] 50 | sphinx = "<7.0.0" 51 | sphinx-click = "^4.4.0" 52 | sphinxawesome-theme = "^4.1.0" 53 | sphinx-copybutton = "^0.5.2" 54 | sphinx-design = "^0.4.1" 55 | myst-parser = "^1.0.0" 56 | 57 | [build-system] 58 | requires = ["poetry-core"] 59 | build-backend = "poetry.core.masonry.api" 60 | 61 | [tool.poetry.scripts] 62 | gimie = 'gimie.cli:app' 63 | 64 | 65 | # Tooling configuration 66 | 67 | [tool.black] 68 | line-length = 79 69 | target-version = ["py38", "py39"] 70 | 71 | [tool.pytest.ini_options] 72 | addopts = ["--doctest-modules", "--cov"] 73 | testpaths = ["gimie", "tests"] 74 | 75 | [tool.pyright] 76 | reportMissingTypeStubs = false 77 | reportUntypedBaseClass = false 78 | 79 | [tool.git-cliff.changelog] 80 | header = "Notable changes introduced in gimie releases are documented in this file\n\n" 81 | body = """ 82 | 83 | {% if version %}\ 84 | ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} 85 | {% else %}\ 86 | ## [unreleased] 87 | {% endif %}\ 88 | {% for group, commits in commits | group_by(attribute="group") %} 89 | ### {{ group | upper_first }} 90 | {% for commit in commits 91 | | filter(attribute="scope") 92 | | sort(attribute="scope") %} 93 | - *({{commit.scope}})* {{ commit.message }} 94 | {%- if commit.breaking %} 95 | {% raw %} {% endraw %}- **BREAKING**: {{commit.breaking_description}} 96 | {%- endif -%} 97 | {%- endfor -%} 98 | {%- for commit in commits %} 99 | {%- if commit.scope -%} 100 | {% else -%} 101 | - {{ commit.message }} 102 | {% if commit.breaking -%} 103 | {% raw %} {% endraw %}- **BREAKING**: {{commit.breaking_description}} 104 | {% endif -%} 105 | {% endif -%} 106 | {% endfor -%} 107 | {% raw %}\n{% endraw %}\ 108 | {% endfor %}\n 109 | """ 110 | footer = "" 111 | 112 | 
[tool.git-cliff.git] 113 | conventional_commits = true 114 | filter_commits = true 115 | commit_parsers = [ 116 | { message = "^feat", group = "Features" }, 117 | { message = "^(fix|bug)", group = "Bug Fixes" }, 118 | { message = "^doc", group = "Documentation" }, 119 | ] 120 | 121 | commit_preprocessors = [ 122 | { pattern = 'Merged PR #[0-9]: (.*)', replace = "$1" }, 123 | { pattern = " +", replace = " " }, 124 | ] 125 | -------------------------------------------------------------------------------- /gimie/project.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Orchestration of multiple extractors for a given project. 
18 | This is the main entry point for end-to-end analysis.""" 19 | from typing import Iterable, Optional, Tuple 20 | 21 | from rdflib import Graph 22 | from rdflib.term import URIRef 23 | from urllib.parse import urlparse 24 | 25 | from gimie.extractors import get_extractor, infer_git_provider 26 | from gimie.graph.operations import properties_to_graph 27 | from gimie.parsers import parse_files 28 | from gimie.utils.uri import validate_url 29 | 30 | 31 | class Project: 32 | """A class to represent a project's git repository. 33 | 34 | 35 | Parameters 36 | ---------- 37 | path: 38 | The full path (URL) of the repository. 39 | base_url: 40 | The base URL of the git remote. Can be used to 41 | specify delimitation between base URL and project name. 42 | git_provider: 43 | The name of the git provider to extract metadata from. 44 | ('git', 'github', 'gitlab') 45 | parser_names: 46 | Names of file parsers to use. ('license'). 47 | If None, default parsers are used (see gimie.parsers.PARSERS). 
48 | 49 | Examples 50 | -------- 51 | >>> proj = Project("https://github.com/sdsc-ordes/gimie") 52 | >>> assert isinstance(proj.extract(), Graph) 53 | """ 54 | 55 | def __init__( 56 | self, 57 | path: str, 58 | base_url: Optional[str] = None, 59 | git_provider: Optional[str] = None, 60 | parser_names: Optional[Iterable[str]] = None, 61 | ): 62 | if not git_provider: 63 | git_provider = infer_git_provider(path) 64 | 65 | self.base_url = base_url 66 | self.project_dir = None 67 | self._cloned = False 68 | if validate_url(path): 69 | self.url = path 70 | else: 71 | self.project_dir = path 72 | 73 | self.extractor = get_extractor( 74 | self.url, 75 | git_provider, 76 | base_url=self.base_url, 77 | local_path=self.project_dir, 78 | ) 79 | if parser_names: 80 | self.parsers = set(parser_names) 81 | else: 82 | self.parsers = None 83 | 84 | def extract(self) -> Graph: 85 | """Extract repository metadata from git provider to RDF graph and enrich with 86 | metadata parsed from file contents.""" 87 | 88 | repo = self.extractor.extract() 89 | repo_graph = repo.to_graph() 90 | 91 | files = self.extractor.list_files() 92 | parsed_graph = parse_files(self.url, files, self.parsers) 93 | 94 | repo_graph += parsed_graph 95 | return repo_graph 96 | 97 | 98 | def split_git_url(url: str) -> Tuple[str, str]: 99 | """Split a git URL into base URL and project path. 
100 | 101 | Examples 102 | -------- 103 | >>> split_git_url("https://gitlab.com/foo/bar") 104 | ('https://gitlab.com', 'foo/bar') 105 | """ 106 | base_url = urlparse(url).scheme + "://" + urlparse(url).netloc 107 | project = urlparse(url).path.strip("/") 108 | return base_url, project 109 | -------------------------------------------------------------------------------- /gimie/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """Files which can be parsed by gimie.""" 18 | from pathlib import Path 19 | from typing import Iterable, NamedTuple, Optional, Set, Type 20 | 21 | from gimie.graph import Property 22 | from gimie.io import Resource 23 | from gimie.parsers.abstract import Parser 24 | from gimie.parsers.license import LicenseParser, is_license_filename 25 | from gimie.parsers.cff import CffParser 26 | 27 | from rdflib import Graph 28 | 29 | 30 | class ParserInfo(NamedTuple): 31 | default: bool 32 | type: Type[Parser] 33 | 34 | 35 | PARSERS = { 36 | "license": ParserInfo(default=True, type=LicenseParser), 37 | "cff": ParserInfo(default=True, type=CffParser), 38 | } 39 | 40 | 41 | def get_parser(name: str) -> Type[Parser]: 42 | """Get a parser by name.""" 43 | parser = PARSERS.get(name, None) 44 | if parser is None: 45 | raise ValueError( 46 | f"Unknown parser: {name}.\n" 47 | f"Supported parsers: {', '.join(PARSERS)}" 48 | ) 49 | return parser.type 50 | 51 | 52 | def list_default_parsers() -> Set[str]: 53 | """List the names of all default parsers.""" 54 | return {k for k, v in PARSERS.items() if v.default} 55 | 56 | 57 | def list_parsers() -> Set[str]: 58 | """List the names of all parsers.""" 59 | return set(PARSERS.keys()) 60 | 61 | 62 | def select_parser( 63 | path: Path, 64 | parsers: Optional[Set[str]] = None, 65 | ) -> Optional[Type[Parser]]: 66 | """Select the appropriate parser from a collection based on a file path. 67 | If no parser is found, return None. 68 | 69 | Parameters 70 | ---------- 71 | path: 72 | The path of the file to parse. 73 | parsers: 74 | A set of parser names. If None, use the default collection. 
75 | """ 76 | # Only parse licenses and citations in the root directory 77 | if is_license_filename(path.name) and len(path.parts) == 1: 78 | name = "license" 79 | elif path.name == "CITATION.cff" and len(path.parts) == 1: 80 | name = "cff" 81 | else: 82 | return None 83 | 84 | if name not in (parsers or list_parsers()): 85 | return None 86 | return get_parser(name) 87 | 88 | 89 | def parse_files( 90 | subject: str, 91 | files: Iterable[Resource], 92 | parsers: Optional[Set[str]] = None, 93 | ) -> Graph: 94 | """For each input file, select appropriate parser among a collection and 95 | parse its contents. Return the union of all parsed properties in the form of triples. 96 | If no parser is found for a given file, skip it. 97 | 98 | Parameters 99 | ---------- 100 | subject: 101 | The subject URI of the repository. 102 | files: 103 | A collection of file-like objects. 104 | parsers: 105 | A set of parser names. If None, use the default collection. 106 | """ 107 | parsed_properties = Graph() 108 | for file in files: 109 | parser = select_parser(file.path, parsers) 110 | if not parser: 111 | continue 112 | data = file.open().read() 113 | parsed_properties |= parser(subject).parse(data or b"") 114 | return parsed_properties 115 | -------------------------------------------------------------------------------- /gimie/io.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Standard input interfaces to local or remote resources for gimie.""" 18 | 19 | import io 20 | import os 21 | from pathlib import Path 22 | import requests 23 | from typing import Iterator, Optional, Union 24 | 25 | 26 | class Resource: 27 | """Abstract class for read-only access to local or remote resources via 28 | a file-like interface. 29 | 30 | Parameters 31 | ---------- 32 | path: 33 | The local relative path to the resource. 34 | """ 35 | 36 | path: Path 37 | 38 | def open(self) -> io.RawIOBase: 39 | raise NotImplementedError 40 | 41 | 42 | class LocalResource(Resource): 43 | """Providing read-only access to local data via a file-like interface. 44 | 45 | Examples 46 | -------- 47 | >>> resource = LocalResource("README.md") 48 | """ 49 | 50 | def __init__(self, path: Union[str, os.PathLike]): 51 | self.path: Path = Path(path) 52 | 53 | def open(self) -> io.RawIOBase: 54 | return io.FileIO(self.path, mode="r") 55 | 56 | 57 | class RemoteResource(Resource): 58 | """Provides read-only access to remote data via a file-like interface. 59 | 60 | Parameters 61 | ---------- 62 | url: 63 | The URL where the resource. can be downladed from. 64 | headers: 65 | Optional headers to pass to the request. 
66 | 67 | Examples 68 | -------- 69 | >>> url = "https://raw.githubusercontent.com/sdsc-ordes/gimie/main/README.md" 70 | >>> content = RemoteResource("README.md", url).open().read() 71 | >>> assert isinstance(content, bytes) 72 | """ 73 | 74 | def __init__(self, path: str, url: str, headers: Optional[dict] = None): 75 | self.path = Path(path) 76 | self.url = url 77 | self.headers = headers or {} 78 | 79 | def open(self) -> io.RawIOBase: 80 | resp = requests.get( 81 | self.url, headers=self.headers, stream=True 82 | ).iter_content(chunk_size=128) 83 | return IterStream(resp) 84 | 85 | 86 | class IterStream(io.RawIOBase): 87 | """Wraps an iterator under a like a file-like interface. 88 | Empty elements in the iterator are ignored. 89 | 90 | Parameters 91 | ---------- 92 | iterator: 93 | An iterator yielding bytes. 94 | 95 | Examples 96 | -------- 97 | >>> stream = IterStream(iter([b"Hello ", b"", b"World"])) 98 | >>> stream.read() 99 | b'Hello World' 100 | """ 101 | 102 | def __init__(self, iterator: Iterator[bytes]): 103 | self.leftover = b"" 104 | self.iterator = iterator 105 | 106 | def readable(self): 107 | return True 108 | 109 | def readinto(self, b): 110 | try: 111 | l = len(b) # We're supposed to return at most this much 112 | while True: 113 | chunk = self.leftover or next(self.iterator) 114 | # skip empty elements 115 | if not chunk: 116 | continue 117 | output, self.leftover = chunk[:l], chunk[l:] 118 | b[: len(output)] = output 119 | return len(output) 120 | except StopIteration: 121 | return 0 # indicate EOF 122 | -------------------------------------------------------------------------------- /gimie/utils/uri.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Utility functions used throughout gimie.""" 18 | 19 | from typing import List, Literal 20 | from urllib.parse import urlparse 21 | import re 22 | 23 | from gimie.graph.namespaces import GIMIE 24 | 25 | 26 | def validate_url(url: str): 27 | """Checks if input is a valid URL. 28 | credits: https://stackoverflow.com/a/38020041 29 | 30 | Examples 31 | ------------- 32 | >>> validate_url('/data/my_repo') 33 | False 34 | >>> validate_url(532) 35 | False 36 | >>> validate_url('https://www.github.com/sdsc-ordes/gimie') 37 | True 38 | >>> validate_url('github.com/sdsc-ordes/gimie') 39 | False 40 | """ 41 | try: 42 | result = urlparse(url) 43 | return all([result.scheme, result.netloc]) 44 | except AttributeError: 45 | return False 46 | 47 | 48 | def generate_uri(ref: str): 49 | """Given a reference (e.g. commit sha), return a URI. 50 | 51 | Parameters 52 | ---------- 53 | path: 54 | Path to the repository, either local or a URL. 55 | 56 | 57 | Returns 58 | ------- 59 | fair_uri: 60 | A unique resource identifier (URI) for the repository path. 61 | 62 | Examples 63 | -------- 64 | >>> generate_uri("abc") 65 | 'https://sdsc-ordes.github.io/gimie/abc' 66 | """ 67 | return str(GIMIE[ref]) 68 | 69 | 70 | def is_valid_orcid(orcid): 71 | """Check if the input is a valid ORCID according to definition from orcid.org [1]_. 72 | .. 
[1] [https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier](https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier) 73 | 74 | Parameters 75 | ---------- 76 | orcid: 77 | The ORCID to validate. 78 | 79 | Returns 80 | ------- 81 | bool: 82 | True if the ORCID is valid, False otherwise. 83 | 84 | Examples 85 | -------- 86 | >>> is_valid_orcid("https://orcid.org/0000-0001-2345-6789") 87 | True 88 | >>> is_valid_orcid("0000-0001-2345-6789") 89 | False 90 | >>> is_valid_orcid("http://orcid.org/0000-0001-2345-6789") 91 | False 92 | 93 | """ 94 | return bool( 95 | re.match( 96 | r"(https:\/\/)?orcid.org\/\d{4}-\d{4}-\d{4}-\d{4}", str(orcid) 97 | ) 98 | ) 99 | 100 | 101 | def extract_doi_match(doi): 102 | """Extracts doi from the input if it contains a valid DOI according to definition from crossref.org [1]_. 103 | .. [1] [https://www.crossref.org/blog/dois-and-matching-regular-expressions](https://www.crossref.org/blog/dois-and-matching-regular-expressions) 104 | 105 | Parameters 106 | ---------- 107 | doi: 108 | The DOI to validate. 109 | 110 | Returns 111 | ------- 112 | str: 113 | The extracted short DOI if it is valid, None otherwise. 
114 | 115 | Examples 116 | -------- 117 | >>> extract_doi_match("10.5281/zenodo.1234567") 118 | '10.5281/zenodo.1234567' 119 | >>> extract_doi_match("https://doi.org/10.5281/zenodo.1234567") 120 | '10.5281/zenodo.1234567' 121 | """ 122 | match = re.search( 123 | r"10.\d{4,9}/[-._;()/:A-Z0-9]+$", doi, flags=re.IGNORECASE 124 | ) 125 | if match: 126 | return match.group() 127 | -------------------------------------------------------------------------------- /docs/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 41 | 43 | 49 | 50 | 55 | 60 | 67 | 74 | 81 | 86 | 93 | gimie 103 | 104 | 105 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish on Github container registry 2 | 3 | on: 4 | release: 5 | type: [published] 6 | push: 7 | branches: [main] 8 | pull_request: 9 | paths: 10 | - 'pyproject.toml' 11 | - './docker/**' 12 | - '.github/workflows/**' 13 | 14 | env: 15 | REGISTRY: ghcr.io 16 | 17 | jobs: 18 | build-image: 19 | runs-on: ubuntu-latest 20 | if: github.ref != 'refs/heads/main' 21 | permissions: 22 | contents: read 23 | packages: write 24 | 25 | steps: 26 | # https://github.com/actions/checkout 27 | - name: checkout repository 28 | uses: actions/checkout@v4 29 | 30 | - name: lowercase image name 31 | run: | 32 | echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} 33 | 34 | # https://github.com/docker/setup-qemu-action 35 | - name: Set up QEMU 36 | uses: docker/setup-qemu-action@v3.0.0 37 | 38 | # https://github.com/docker/setup-buildx-action 39 | - name: Set up Docker Buildx 40 | id: buildx 41 | uses: docker/setup-buildx-action@v3.0.0 42 | 43 | - name: Get current release version 44 | id: release-version 45 | run: | 46 | version=$(grep -E '^version += +' pyproject.toml | sed -E 's/.*= +//' | sed "s/['\"]//g") 47 | echo 
"version=${version}" >> $GITHUB_OUTPUT 48 | echo "version_build=${version}_"$(git rev-parse --short "$GITHUB_SHA") >> $GITHUB_OUTPUT 49 | 50 | # https://github.com/docker/build-push-action 51 | - name: Build Docker image 52 | uses: docker/build-push-action@v5.0.0 53 | with: 54 | context: . 55 | platforms: linux/amd64,linux/arm64 56 | file: .docker/Dockerfile 57 | push: false 58 | tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.release-version.outputs.version_build }} 59 | build-args: VERSION_BUILD=${{ steps.release-version.outputs.version_build }} 60 | outputs: type=image,annotation-index.org.opencontainers.image.description=Extract linked metadata from repositories. 61 | 62 | push-image: 63 | runs-on: ubuntu-latest 64 | if: github.ref == 'refs/heads/main' 65 | permissions: 66 | contents: read 67 | packages: write 68 | 69 | steps: 70 | # https://github.com/actions/checkout 71 | - name: checkout repository 72 | uses: actions/checkout@v4 73 | 74 | - name: lowercase image name 75 | run: | 76 | echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} 77 | 78 | # https://github.com/docker/setup-qemu-action 79 | - name: Set up QEMU 80 | uses: docker/setup-qemu-action@v3.0.0 81 | 82 | # https://github.com/docker/setup-buildx-action 83 | - name: Set up Docker Buildx 84 | id: buildx 85 | uses: docker/setup-buildx-action@v3.0.0 86 | 87 | - name: Get current release version 88 | id: release-version 89 | run: | 90 | version=$(grep -E '^version += +' pyproject.toml | sed -E 's/.*= +//' | sed "s/['\"]//g") 91 | echo "version=${version}" >> $GITHUB_OUTPUT 92 | echo "version_build=${version}_"$(git rev-parse --short "$GITHUB_SHA") >> $GITHUB_OUTPUT 93 | 94 | # https://github.com/docker/login-action 95 | - name: Log in to the Container registry 96 | uses: docker/login-action@v3.0.0 97 | with: 98 | registry: ${{ env.REGISTRY }} 99 | username: ${{ github.actor }} 100 | password: ${{ secrets.GITHUB_TOKEN }} 101 | 102 | # https://github.com/docker/metadata-action 103 | - 
name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5.0.0
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          # FIX: these previously referenced needs.build-image.outputs.*,
          # but this job declares no `needs:` and build-image defines no
          # job-level `outputs:` (the two jobs' `if:` conditions are
          # mutually exclusive anyway), so the tags always expanded to
          # empty strings. Use this job's own release-version step instead.
          tags: |
            type=raw,value=latest,enable=${{ github.event_name == 'push' }}
            type=raw,value=${{ steps.release-version.outputs.version_build }},enable=${{ github.event_name == 'push' }}
            type=raw,value=${{ steps.release-version.outputs.version }},enable=${{ github.event_name == 'release' }}

      # https://github.com/docker/build-push-action
      - name: Push Docker image
        uses: docker/build-push-action@v5.0.0
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          file: .docker/Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: VERSION_BUILD=${{ steps.release-version.outputs.version_build }}
          outputs: type=image,annotation-index.org.opencontainers.image.description=Extract linked metadata from repositories.
--------------------------------------------------------------------------------
/gimie/cli.py:
--------------------------------------------------------------------------------
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Command line interface to the gimie package.""" 18 | from enum import Enum 19 | from typing import List, Optional 20 | 21 | import click 22 | import typer 23 | 24 | from gimie import __version__ 25 | from gimie.parsers import get_parser, list_default_parsers, list_parsers 26 | from gimie.project import Project 27 | 28 | app = typer.Typer(add_completion=False) 29 | 30 | 31 | # Used to autogenerate docs with sphinx-click 32 | @click.group() 33 | def cli(): 34 | """Command line group""" 35 | pass 36 | 37 | 38 | class RDFFormatChoice(str, Enum): 39 | ttl = "ttl" 40 | jsonld = "json-ld" 41 | nt = "nt" 42 | 43 | 44 | def version_callback(value: bool): 45 | if value: 46 | print(f"gimie {__version__}") 47 | # Exits successfully 48 | raise typer.Exit() 49 | 50 | 51 | @app.command() 52 | def data( 53 | url: str, 54 | format: RDFFormatChoice = typer.Option( 55 | RDFFormatChoice.ttl, 56 | "--format", 57 | show_choices=True, 58 | help="Output serialization format for the RDF graph.", 59 | ), 60 | base_url: Optional[str] = typer.Option( 61 | None, 62 | "--base-url", 63 | help="Specify the base URL of the git provider. Inferred by default.", 64 | ), 65 | include_parser: Optional[List[str]] = typer.Option( 66 | None, 67 | "--include-parser", 68 | "-I", 69 | help="Only include selected parser. Use 'gimie parsers' to list parsers.", 70 | ), 71 | exclude_parser: Optional[List[str]] = typer.Option( 72 | None, 73 | "--exclude-parser", 74 | "-X", 75 | help="Exclude selected parser.", 76 | ), 77 | version: Optional[bool] = typer.Option( 78 | None, 79 | "--version", 80 | help="Display version and exit", 81 | callback=version_callback, 82 | ), 83 | ): 84 | """Extract linked metadata from a Git repository at the target URL. 85 | 86 | The output is sent to stdout, and turtle is used as the default serialization format. 
87 | """ 88 | parser_names = list_default_parsers() 89 | if exclude_parser: 90 | parser_names -= set([parser for parser in exclude_parser]) 91 | if include_parser: 92 | parser_names = set([parser for parser in include_parser]) 93 | proj = Project(url, base_url=base_url, parser_names=parser_names) 94 | repo_meta = proj.extract() 95 | print(repo_meta.serialize(format=format.value)) 96 | 97 | 98 | @app.command() 99 | def advice(url: str): 100 | """Show a metadata completion report for a Git repository 101 | at the target URL. 102 | 103 | NOTE: Not implemented yet""" 104 | ... 105 | raise typer.Exit() 106 | 107 | 108 | @app.command() 109 | def parsers( 110 | verbose: bool = typer.Option( 111 | False, "--verbose", help="Show parser description." 112 | ) 113 | ): 114 | """List available parsers, specifying which are default. 115 | If --verbose is used, show parser description.""" 116 | message = "" 117 | parsers = list_parsers() 118 | default_parsers = list_default_parsers() 119 | 120 | for name in parsers: 121 | # Each parser gets their name in bold green 122 | title = typer.style(name, fg=typer.colors.GREEN, bold=True) 123 | default = " (default)" if name in default_parsers else "" 124 | description = f" - {get_parser(name).__doc__}" if verbose else "" 125 | 126 | parser_line = f"{title}{default}{description}" 127 | message += f"{parser_line}\n" 128 | 129 | typer.echo(message) 130 | 131 | 132 | typer_cli = typer.main.get_command(app) 133 | cli.add_command(typer_cli, "cli") 134 | 135 | 136 | # This callback is triggered when gimie is called without subcommand 137 | @app.callback() 138 | def callback( 139 | version: Optional[bool] = typer.Option( 140 | None, "--version", callback=version_callback 141 | ) 142 | ): 143 | """gimie digs Git repositories for metadata.""" 144 | 145 | 146 | if __name__ == "__main__": 147 | app() 148 | -------------------------------------------------------------------------------- /gimie/parsers/license/__init__.py: 
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from io import BytesIO
import pkgutil
import re
from typing import List, Optional, Set

import numpy as np
import scipy.sparse as sp
from rdflib.term import URIRef
from rdflib import Graph
from gimie.graph.namespaces import SDO
from gimie.parsers.abstract import Parser, Property
from gimie.utils.text_processing import TfidfVectorizer


class LicenseParser(Parser):
    """Parse a LICENSE body into a schema:license triple.
    Uses tf-idf-based matching against the SPDX license corpus."""

    def __init__(self, subject: str):
        super().__init__(subject)

    def parse(self, data: bytes) -> Graph:
        """Extracts an spdx URL from a license file and returns a
        graph with a single (subject, schema:license, spdx_url) triple.
        If no matching URL is found, an empty graph is returned.
        """
        license_facts = Graph()
        license_url = match_license(data)

        if license_url:
            license_facts.add((self.subject, SDO.license, URIRef(license_url)))
        return license_facts


def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
    """Given a license file, returns the url of the most similar spdx license.
    This is done using TF-IDF on the license text and getting the
    closest match in the SPDX license corpus based on cosine similarity.

    Parameters
    ----------
    data:
        The license body as bytes.
    min_similarity:
        Cosine-similarity threshold below which no match is reported.

    Returns
    -------
    str, optional
        URL of the closest spdx license, or None when the best match
        is below ``min_similarity``.

    Examples
    --------
    >>> match_license(open('LICENSE', 'rb').read())
    'https://spdx.org/licenses/Apache-2.0.html'
    """
    # Compute tfidf vector for input license
    vectorizer = load_tfidf_vectorizer()
    input_vec = vectorizer.transform([data.decode()])

    # Load ids and tfidf vectors for spdx licenses
    spdx_licenses = load_spdx_ids()
    spdx_vecs = load_tfidf_matrix()
    # Compute cosine similarity between input_vec and spdx vectors
    sim: np.ndarray = (input_vec * spdx_vecs.T).todense()
    # Pick the most similar spdx vector
    closest_idx = np.argmax(sim)
    # If similarity is below threshold, return None
    if sim[0, closest_idx] < min_similarity:
        return None
    closest_id = spdx_licenses[closest_idx]
    return f"https://spdx.org/licenses/{closest_id}.html"


def load_tfidf_vectorizer() -> TfidfVectorizer:
    """Load the pre-fitted tfidf vectorizer shipped with the package.

    Raises
    ------
    FileNotFoundError
        If the packaged vectorizer file is missing.
    """
    data = pkgutil.get_data(__name__, "data/tfidf_vectorizer.json")
    if data is None:
        raise FileNotFoundError("Could not find tfidf_vectorizer.json")
    return TfidfVectorizer.model_validate_json(data)


def load_spdx_ids() -> List[str]:
    """Load spdx license identifiers from disk.

    Returns
    -------
    list of str
        The first CSV column (the spdx id) of each non-empty row.

    Raises
    ------
    FileNotFoundError
        If the packaged csv file is missing.
    """
    data = pkgutil.get_data(__name__, "data/spdx_licenses.csv")
    if data is None:
        raise FileNotFoundError("Could not find spdx_licenses.csv")
    # splitlines() is robust to a trailing newline and to \r\n line
    # endings, which split("\n") would leave as empty / '\r'-polluted rows.
    reader = csv.reader(data.decode().splitlines())
    return [row[0] for row in reader if row]
def load_tfidf_matrix() -> sp.csr_matrix:
    """Load pre-computed tfidf matrix of spdx licenses from disk.
    Matrix has dimensions (n_licenses, n_features)."""
    raw = pkgutil.get_data(__name__, "data/tfidf_matrix.npz")
    if raw is None:
        raise FileNotFoundError("Could not find tfidf_matrix.npz")
    return sp.load_npz(BytesIO(raw))


def is_license_filename(filename: str) -> bool:
    """Given an input filename, returns a boolean indicating whether the filename path looks like a license.

    Parameters
    ----------
    filename:
        A filename to check.

    Examples
    --------
    >>> is_license_filename('LICENSE-APACHE')
    True
    >>> is_license_filename('README.md')
    False
    """
    # Hidden files (e.g. dotfiles) are never treated as licenses.
    if filename.startswith("."):
        return False
    pattern = r".*(license(s)?.*|lizenz|reus(e|ing).*|copy(ing)?.*)(\.(txt|md|rst))?$"
    # re.match anchors at the start; case-insensitive so LICENSE/license both hit.
    return bool(re.match(pattern, filename, flags=re.IGNORECASE))
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extractor which uses a locally available (usually cloned) repository."""
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
import os
import shutil
import tempfile
from typing import List, Optional
import uuid

import git
import pydriller

from gimie.io import LocalResource
from gimie.models import Person, Repository
from gimie.extractors.abstract import Extractor
from pathlib import Path


@dataclass
class GitExtractor(Extractor):
    """
    This class is responsible for extracting metadata from a git repository.

    Parameters
    ----------
    url: str
        The url of the git repository.
    base_url: Optional[str]
        The base url of the git remote.
    local_path: Optional[str]
        The local path where the cloned git repository is located.

    Attributes
    ----------
    uri: Optional[str]
        The URI to assign the repository in RDF.
    repository: Repository
        The repository we are extracting metadata from.
    """

    url: str
    base_url: Optional[str] = None
    local_path: Optional[str] = None
    # Set to True when we cloned into a temp dir ourselves (see __del__).
    _cloned: bool = False

    def extract(self) -> Repository:
        """Extract creator, contributors and timestamps into a Repository."""
        # Assuming author is the first person to commit
        self.repository = self._repo_data

        repo_meta = dict(
            authors=[self._get_creator()],
            contributors=self._get_contributors(),
            date_created=self._get_creation_date(),
            date_modified=self._get_modification_date(),
            # NOTE(review): ``path`` is presumably provided by the Extractor
            # base class -- confirm, it is not defined in this file.
            name=self.path,
            url=self.url,
        )

        return Repository(**repo_meta)  # type: ignore

    def list_files(self) -> List[LocalResource]:
        """List all files in the work tree, excluding the .git directory."""
        self.repository = self._repo_data
        file_list = []

        for path in Path(self.local_path).rglob("*"):  # type: ignore
            if (path.parts[0] == ".git") or not path.is_file():
                continue
            file_list.append(LocalResource(path))

        return file_list

    def __del__(self):
        """Cleanup the cloned repo if it was cloned and is located in tempdir."""
        try:
            # Can't be too careful with temp files
            tempdir = tempfile.gettempdir()
            if (
                self.local_path
                and self._cloned
                and self.local_path.startswith(tempdir)
                and tempdir != os.getcwd()
            ):
                shutil.rmtree(self.local_path)
        except AttributeError:
            pass

    @cached_property
    def _repo_data(self) -> pydriller.Repository:
        """Get the repository data by accessing local data or cloning.

        When no local path is given, the repository is cloned into a
        temporary directory which __del__ removes again.
        """
        if self.local_path is None:
            self._cloned = True
            # mkdtemp() creates a directory that persists until we delete it.
            # The previous ``TemporaryDirectory().name`` discarded the object,
            # so its finalizer removed the directory immediately and only the
            # clone recreating the path made it appear to work.
            self.local_path = tempfile.mkdtemp()
            git.Repo.clone_from(self.url, self.local_path)  # type: ignore
        return pydriller.Repository(self.local_path)

    def _get_contributors(self) -> List[Person]:
        """Get the deduplicated (name, email) authors of all commits."""
        authors = set()
        for commit in self.repository.traverse_commits():
            if commit.author is not None:
                authors.add((commit.author.name, commit.author.email))
        return [self._dev_to_person(name, email) for name, email in authors]

    def _get_creation_date(self) -> Optional[datetime]:
        """Get the creation date of the repository (first commit date)."""
        try:
            return next(self.repository.traverse_commits()).author_date
        except StopIteration:
            return None

    def _get_modification_date(self) -> Optional[datetime]:
        """Get the last modification date of the repository (last commit date)."""
        # Walk all commits and keep the last one. The previous version
        # returned from a ``finally`` block, which silently swallows any
        # exception raised while traversing.
        last_commit = None
        for last_commit in self.repository.traverse_commits():
            pass
        return last_commit.author_date if last_commit else None

    def _get_creator(self) -> Optional[Person]:
        """Get the creator of the repository (author of the first commit)."""
        try:
            creator = next(self.repository.traverse_commits()).author
            return self._dev_to_person(creator.name, creator.email)
        except StopIteration:
            return None

    def _dev_to_person(
        self, name: Optional[str], email: Optional[str]
    ) -> Person:
        """Convert a Developer (name, email) pair to a Person object."""
        if name is None:
            # No name available: fall back to a random unique id.
            uid = str(uuid.uuid4())
        else:
            uid = name.replace(" ", "_").lower()
        dev_id = f"{self.url}/{uid}"
        return Person(
            _id=dev_id,
            identifier=uid,
            name=name,
            email=email,
        )
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Data models to represent nodes in the graph generated by gimie.""" 18 | from __future__ import annotations 19 | from dataclasses import dataclass, field 20 | from datetime import datetime 21 | import datetime 22 | from typing import List, Optional, Union 23 | 24 | from calamus.schema import JsonLDSchema 25 | from calamus import fields 26 | from rdflib import Graph 27 | 28 | from gimie.graph.namespaces import SDO 29 | 30 | 31 | @dataclass(order=True) 32 | class Release: 33 | """ 34 | This class represents a release of a repository. 35 | 36 | Parameters 37 | ---------- 38 | tag: str 39 | The tag of the release. 40 | date: datetime.datetime 41 | The date of the release. 42 | commit_hash: str 43 | The commit hash of the release. 
44 | """ 45 | 46 | tag: str = field(compare=False) 47 | date: datetime = field(compare=True) 48 | commit_hash: str = field(compare=False) 49 | 50 | 51 | @dataclass 52 | class Organization: 53 | """See http//schema.org/Organization""" 54 | 55 | _id: str 56 | name: str 57 | legal_name: Optional[str] = None 58 | email: Optional[List[str]] = None 59 | description: Optional[str] = None 60 | logo: Optional[str] = None 61 | 62 | 63 | class OrganizationSchema(JsonLDSchema): 64 | _id = fields.Id() 65 | name = fields.String(SDO.name) 66 | legal_name = fields.String(SDO.legalName) 67 | email = fields.String(SDO.email) 68 | description = fields.String(SDO.description) 69 | logo = fields.IRI(SDO.logo) 70 | 71 | class Meta: 72 | rdf_type = SDO.Organization 73 | model = Organization 74 | 75 | 76 | @dataclass 77 | class Person: 78 | """See http//schema.org/Person""" 79 | 80 | _id: str 81 | identifier: str 82 | name: Optional[str] = None 83 | email: Optional[str] = None 84 | affiliations: Optional[List[Organization]] = None 85 | 86 | def __str__(self): 87 | name = f"({self.name}) " if self.name else "" 88 | email = f"<{self.email}> " if self.email else "" 89 | orgs = ( 90 | f"[{', '.join([org.name for org in self.affiliations])}]" 91 | if self.affiliations 92 | else "" 93 | ) 94 | return f"{self.identifier} {name}{email}{orgs}".strip(" ") 95 | 96 | 97 | class PersonSchema(JsonLDSchema): 98 | _id = fields.Id() 99 | identifier = fields.String(SDO.identifier) 100 | name = fields.String(SDO.name) 101 | affiliations = fields.Nested( 102 | SDO.affiliation, OrganizationSchema, many=True 103 | ) 104 | 105 | class Meta: 106 | rdf_type = SDO.Person 107 | model = Person 108 | 109 | 110 | @dataclass 111 | class Repository: 112 | """This class represents a git repository. 113 | It does not contain any information about the content of the repository. 
114 | See https://schema.org/SoftwareSourceCode 115 | """ 116 | 117 | url: str 118 | name: str 119 | 120 | authors: Optional[List[Union[Organization, Person]]] = None 121 | contributors: Optional[List[Person]] = None 122 | date_created: Optional[datetime] = None 123 | date_modified: Optional[datetime] = None 124 | date_published: Optional[datetime] = None 125 | description: Optional[str] = None 126 | download_url: Optional[str] = None 127 | identifier: Optional[str] = None 128 | keywords: Optional[List[str]] = None 129 | licenses: Optional[List[str]] = None 130 | parent_repository: Optional[str] = None 131 | prog_langs: Optional[List[str]] = None 132 | version: Optional[str] = None 133 | 134 | @property 135 | def _id(self) -> str: 136 | """Unique identifier for the repository.""" 137 | return self.url 138 | 139 | def to_graph(self) -> Graph: 140 | """Convert repository to RDF graph.""" 141 | jd = RepositorySchema().dumps(self) 142 | g: Graph = Graph().parse(format="json-ld", data=str(jd)) 143 | g.bind("schema", SDO) 144 | return g 145 | 146 | def serialize(self, format: str = "ttl", **kwargs) -> str: 147 | """Serialize the RDF graph representing the instance.""" 148 | return self.to_graph().serialize(format=format, **kwargs) # type: ignore 149 | 150 | def jsonld(self) -> str: 151 | """Alias for jsonld serialization.""" 152 | return self.serialize(format="json-ld") 153 | 154 | 155 | class RepositorySchema(JsonLDSchema): 156 | """This defines the schema used for json-ld serialization.""" 157 | 158 | _id = fields.Id() 159 | authors = fields.Nested( 160 | SDO.author, [PersonSchema, OrganizationSchema], many=True 161 | ) 162 | contributors = fields.Nested(SDO.contributor, PersonSchema, many=True) 163 | date_created = fields.Date(SDO.dateCreated) 164 | date_modified = fields.Date(SDO.dateModified) 165 | date_published = fields.Date(SDO.datePublished) 166 | description = fields.String(SDO.description) 167 | download_url = fields.IRI(SDO.downloadUrl) 168 | identifier = 
fields.String(SDO.identifier) 169 | keywords = fields.List(SDO.keywords, fields.String) 170 | licenses = fields.List(SDO.license, fields.IRI) 171 | name = fields.String(SDO.name) 172 | parent_repository = fields.IRI(SDO.isBasedOn) 173 | prog_langs = fields.List(SDO.programmingLanguage, fields.String) 174 | url = fields.IRI(SDO.codeRepository) 175 | version = fields.String(SDO.version) 176 | 177 | class Meta: 178 | rdf_type = SDO.SoftwareSourceCode 179 | model = Repository 180 | add_value_types = False 181 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Notable changes introduced in gimie releases are documented in this file 2 | 3 | 4 | ## [0.7.2] - 2024-12-18 5 | 6 | ### Bug Fixes 7 | 8 | - *(cff)* doi structure parsing (#121) 9 | 10 | 11 | ## [0.7.1] - 2024-12-09 12 | 13 | ### Bug Fixes 14 | 15 | - *(dependency missing)* Added pyyaml (#119) 16 | 17 | 18 | ## [0.7.0] - 2024-11-28 19 | 20 | ### Bug Fixes 21 | 22 | - *(cff)* enforce valid urls as doi (#108)- spelling mistake in run as library docs (#113) 23 | 24 | ### Documentation 25 | - update gimie API examples (#105) 26 | - add CFF file (#111) 27 | 28 | ### Features 29 | 30 | - *(parser)* extract authors from CFF files (#115)- add parsers support (#97) 31 | - cff to doi parser (#107) 32 | 33 | 34 | ## [0.6.0] - 2023-10-19 35 | 36 | ### Bug Fixes 37 | 38 | - *(deps)* switch to scancode mini (#88) 39 | - *(docker)* push action was missing buildx (#91) 40 | - *(github)* replace superseded schema:isBasedOnUrl property (#80)- incorrect mapping for schema:codeRepository (#64) 41 | - *(license)* NOASSERTION should not return triples. 
(#66) 42 | 43 | ### Features 44 | 45 | - *(conventional-PRs)* all PRs will need to follow conventional format 46 | - *(conventional-PRs)* all PRs will need to follow conventional format 47 | - *(github.py)* Get "forked from" property of a repository (#79) 48 | - *(io)* file-like interface to remote resources (#70)- license matcher for git extractor (#78) 49 | 50 | 51 | ## [0.5.1] - 2023-07-10 52 | 53 | ### Bug Fixes 54 | 55 | - incorrect mapping for schema:codeRepository (#64) 56 | 57 | 58 | ## [0.5.0] - 2023-07-04 59 | 60 | ### Bug Fixes 61 | 62 | - *(gitlab)* extraction of author on user-owned projects (#57) 63 | 64 | ### Documentation 65 | 66 | - add docs website (#58) 67 | 68 | ### Features 69 | 70 | - *(gitlab)* support private instances (#62) 71 | 72 | 73 | ## [0.4.0] - 2023-06-09 74 | 75 | ### Bug Fixes 76 | 77 | - *(docs)* execute Makefile rule with poetry 78 | - *(gitlab)* edge case where no release available 79 | - *(gitlab)* pass user node to _get_author instead of parent node 80 | - *(gitlab)* rm debug breakpoint 81 | - *(gitlab)* extraction of author on user-owned projects (#57)- gitlab download url 82 | - prevent license finder from picking up docs files 83 | 84 | ### Documentation 85 | 86 | - *(api)* reduce autodoc ToC depth 87 | - *(cli)* add and configure sphinx-click to work with typer 88 | - *(deps)* introduce doc dependency group 89 | - *(git)* rm duplicate attibute from docstring 90 | - *(setup)* add sphinx configuration 91 | - *(style)* add logo + favicon 92 | - *(style)* add logo to front page 93 | - *(theme)* furo -> sphinxawesome 94 | - *(theme)* add sphinx_design extension, downgrade to sphinx6 for compat 95 | - *(tokens)* Add tutorial for encrypted tokens 96 | - *(tokens)* fix windows instructions- add Makefile rule to generate sphinx website 97 | - initial sphinx website with apidoc 98 | - add apidoc output to gitignore 99 | - add intro pages 100 | - improve header names 101 | - add quickstart section, enable tabbing and crossref 102 | - 
add sphinx-tabs as doc dep 103 | - add sphinx-copybutton extension 104 | - add changelog and configure git-cliff 105 | - replace deprecated commonmark parser with myst 106 | - enable placeholder highlighting extension 107 | - improve index format 108 | - add windows variant for env var 109 | - add docs website (#58) 110 | - update readme and add docs badge 111 | 112 | ### Features 113 | 114 | - *(gitlab)* fallback to rest api if author missing from graphql. make type hints py38 compat. 115 | - *(io)* Allow rdflib kwargs in serialize()- use GraphQL API in gh extractor (#33) 116 | - Git extractor (#42) 117 | - disallow local paths (#46) 118 | 119 | 120 | ## [0.3.0] - 2023-02-24 121 | 122 | ### Bug Fixes 123 | 124 | - exclude hidden files from license search 125 | - correctly handle one or multiple license paths 126 | - temporarily disable scancode (#19) 127 | - rename GITHUB_TOKEN to ACCESS_TOKEN 128 | - change token back to ACCESS_TOKEN since GITHUB_TOKEN failed 129 | - GITHUB_TOKEN must be prefixed with github as environment variable 130 | - set test workflow back to using ACCESS_TOKEN as a repo secret 131 | - add .dockerignore, copy necessary files only and improve comments 132 | - rename container-publish.yml into docker-publish.yml 133 | - 'building docker image' instead of 'building docker container' 134 | 135 | ### Documentation 136 | 137 | - define initial contributing guidelines 138 | - add usage examples in README 139 | - update copyright notice in license 140 | - specify type hints and rm unused imports in LicenseMetadata 141 | - add dev status in readme 142 | - document the release process in the readme 143 | - readme badges (#25) 144 | - add section to the readme on how to provide a github token 145 | - adapt documentation to usage of ACCESS_TOKEN instead of GITHUB_TOKEN 146 | - adapt readme to installation with makefile 147 | - give options to install either PyPI or dev version of gimie 148 | - add message for docker-build Makefile rule 149 | - add 
image annotations to dockerfile 150 | - add docker instructions in readme 151 | 152 | ### Features 153 | 154 | - *(cli)* add CLI skeleton (#9)- initial project definition with pyproject.toml 155 | - add placeholder folders 156 | - add placeholder tests 157 | - add basic repo class and placeholder source interfaces 158 | - add console entrypoint definition in pyproject.toml 159 | - add GitMetadata methods to get commit authors and repository creation date 160 | - add method to get releases date and commit hash 161 | - sort releases by date 162 | - add method to get git repo creator 163 | - add unit tests for git source 164 | - Created a license finder using scancode toolkit 165 | - Added triple serialization of license result (spdx url) 166 | - use cached property from functools 167 | - added a make_graph script. Now only contains add_license_to_graph(). 168 | - Created software class, and make graph functions, black reformat 169 | - add license scanner (#12) 170 | - add prototype for RDF graph serialization (#15) 171 | - initial architecture with GithubExtractor (#23) 172 | - add python-dotenv to dependecies 173 | - pick up github token from the environment variables 174 | - add `.env.dist` file as an example for a `.env` file 175 | - provide option to provide github_token when calling extractor 176 | - add pre-commit to dependencies 177 | - add makefile to make installation easier 178 | - add Dockerfile and entrypoint.sh 179 | - add Makefile rule to build the docker image 180 | - add github workflow to push image to github container registry 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![gimie](docs/logo.svg)](https://github.com/sdsc-ordes/gimie) 2 | 3 | [![PyPI version](https://badge.fury.io/py/gimie.svg)](https://badge.fury.io/py/gimie) [![Python Poetry 
Test](https://github.com/sdsc-ordes/gimie/actions/workflows/poetry-pytest.yml/badge.svg)](https://github.com/sdsc-ordes/gimie/actions/workflows/poetry-pytest.yml) [![docs](https://github.com/sdsc-ordes/gimie/actions/workflows/sphinx-docs.yml/badge.svg)](https://sdsc-ordes.github.io/gimie) [![Coverage Status](https://coveralls.io/repos/github/sdsc-ordes/gimie/badge.svg?branch=main)](https://coveralls.io/github/sdsc-ordes/gimie?branch=main) 4 | 5 | Gimie (GIt Meta Information Extractor) is a python library and command line tool to extract structured metadata from git repositories. 6 | 7 | 8 | ## Context 9 | Scientific code repositories contain valuable metadata which can be used to enrich existing catalogues, platforms or databases. This tool aims to easily extract structured metadata from a generic git repositories. It can extract extract metadata from the Git provider (GitHub or GitLab) or from the git index itself. 10 | 11 | ---------------------------------------------------------------------- 12 | 13 | Using Gimie: easy peasy, it's a 3 step process. 14 | 15 | ## 1: Installation 16 | 17 | To install the stable version on PyPI: 18 | 19 | ```shell 20 | pip install gimie 21 | ``` 22 | 23 | To install the dev version from github: 24 | 25 | ```shell 26 | pip install git+https://github.com/sdsc-ordes/gimie.git@main#egg=gimie 27 | ``` 28 | 29 | Gimie is also available as a docker container hosted on the [Github container registry](https://github.com/sdsc-ordes/gimie/pkgs/container/gimie): 30 | 31 | ```shell 32 | docker pull ghcr.io/sdsc-ordes/gimie:latest 33 | 34 | # The access token can be provided as an environment variable 35 | docker run -e GITHUB_TOKEN=$GITHUB_TOKEN ghcr.io/sdsc-ordes/gimie:latest gimie data 36 | ``` 37 | 38 | ## 2 : Set your credentials 39 | 40 | In order to access the github api, you need to provide a github token with the `read:org` scope. 41 | 42 | ### A. Create access tokens 43 | 44 | New to access tokens? 
Or don't know how to get your Github / Gitlab token ? 45 | 46 | Have no fear, see 47 | [here for Github tokens](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) and [here for Gitlab tokens](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html). 48 | (Note: tokens are as precious as passwords! Treat them as such.) 49 | 50 | ### B. Set your access tokens via the Terminal 51 | 52 | Gimie will use your access tokens to gather information for you. If you want info about a Github repo, Gimie needs your Github token; if you want info about a Gitlab Project then Gimie needs your Gitlab token. 53 | 54 | Add your tokens one by one in your terminal: 55 | your Github token: 56 | ```bash 57 | export GITHUB_TOKEN= 58 | ``` 59 | and/or your Gitlab token: 60 | ```bash 61 | export GITLAB_TOKEN= 62 | ``` 63 | 64 | ## 3: GIMIE info ! Run Gimie 65 | 66 | ### As a command line tool 67 | 68 | ```shell 69 | gimie data https://github.com/numpy/numpy 70 | ``` 71 | (want a Gitlab project instead? Just replace the URL in the command line) 72 | 73 | ### As a python library 74 | 75 | ```python 76 | from gimie.project import Project 77 | proj = Project("https://github.com/numpy/numpy") 78 | 79 | # To retrieve the rdflib.Graph object 80 | g = proj.extract() 81 | 82 | # To retrieve the serialized graph 83 | g_in_ttl = g.serialize(format='ttl') 84 | print(g_in_ttl) 85 | ``` 86 | For more advanced use see [the documentation](https://sdsc-ordes.github.io/gimie/intro/usage_python.html). 87 | ## Outputs 88 | 89 | The default output is [Turtle](https://www.w3.org/TR/turtle/), a textual syntax for [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) data model. We follow the schema recommended by [codemeta](https://codemeta.github.io/). 90 | Supported formats are turtle, json-ld and n-triples (by specifying the `--format` argument in your call i.e. 
`gimie data https://github.com/numpy/numpy --format 'ttl'`). 91 | 92 | With no specifications, Gimie will print results in the terminal. Want to save Gimie output to a file? Add your file path to the end : `gimie data https://github.com/numpy/numpy > path_to_output/gimie_output.ttl` 93 | 94 | ---------------------------------------------------------------------- 95 | 96 | ## Contributing 97 | 98 | All contributions are welcome. New functions and classes should have associated tests and docstrings following the [numpy style guide](https://numpydoc.readthedocs.io/en/latest/format.html). 99 | 100 | The code formatting standard we use is [black](https://github.com/psf/black), with `--line-length=79` to follow [PEP8](https://peps.python.org/pep-0008/) recommendations. We use [pytest](https://docs.pytest.org/en/7.2.x/) as our testing framework. This project uses [pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) to define package information, requirements and tooling configuration. 
101 | 102 | ### For development: 103 | 104 | activate a conda or virtual environment with Python 3.8 or higher 105 | 106 | ```shell 107 | git clone https://github.com/sdsc-ordes/gimie && cd gimie 108 | make install 109 | ``` 110 | 111 | run tests: 112 | 113 | ```shell 114 | make test 115 | ``` 116 | 117 | run checks: 118 | 119 | ```shell 120 | make check 121 | ``` 122 | for an easier use Github/Gitlab APIs, place your access tokens in the `.env` file: (and don't worry, the `.gitignore` will ignore them when you push to GitHub) 123 | 124 | ``` 125 | cp .env.dist .env 126 | ``` 127 | 128 | build documentation: 129 | 130 | ```shell 131 | make doc 132 | ``` 133 | 134 | ## Releases and Publishing on Pypi 135 | 136 | Releases are done via github release 137 | 138 | - a release will trigger a github workflow to publish the package on Pypi 139 | - Make sure to update to a new version in `pyproject.toml` and `conf.py` before making the release 140 | - It is possible to test the publishing on Pypi.test by running a manual workflow: go to github actions and run the Workflow: 'Publish on Pypi Test' 141 | 142 | ## Copyright 143 | Copyright © 2024-2025 Swiss Data Science Center (SDSC),[www.datascience.ch](http://www.datascience.ch/), ROR: [ror.org/02hdt9m26](https://ror.org/02hdt9m26). All rights reserved. The SDSC is a Swiss National Research Infrastructure, jointly established and legally represented by the École Polytechnique Fédérale de Lausanne (EPFL) and the Eidgenössische Technische Hochschule Zürich (ETH Zürich) as a société simple. This copyright encompasses all materials, software, documentation, and other content created and developed by the SDSC. 
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from io import BytesIO
import re
from typing import List, Optional, Set
import yaml
from rdflib.term import URIRef
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF
from gimie import logger
from gimie.graph.namespaces import SDO, MD4I
from gimie.parsers.abstract import Parser
from gimie.utils.uri import is_valid_orcid, extract_doi_match


class CffParser(Parser):
    """Parse DOI and authors from CITATION.cff."""

    def __init__(self, subject: str):
        super().__init__(subject)

    def parse(self, data: bytes) -> Graph:
        """Extracts DOIs and list of authors from a CFF file and returns a
        graph with schema:citation triples for each DOI, plus one node per
        author with name, orcid and affiliation values.
        If no DOIs are found, they will not be included in the graph.
        If no authors are found, they will not be included in the graph.
        If neither authors nor DOIs are found, an empty graph is returned.
        """
        extracted_cff_triples = Graph()
        dois = get_cff_doi(data)
        authors = get_cff_authors(data)

        if dois:
            for doi in dois:
                extracted_cff_triples.add(
                    (self.subject, SDO.citation, URIRef(doi))
                )
        if not authors:
            return extracted_cff_triples
        for author in authors:
            # All of orcid / given-names / family-names / affiliation are
            # optional in the CFF schema; .get() avoids KeyError on authors
            # that omit them (previously a hard crash).
            orcid_value = author.get("orcid")
            if not orcid_value or not is_valid_orcid(orcid_value):
                continue
            orcid = URIRef(orcid_value)
            extracted_cff_triples.add((self.subject, SDO.author, orcid))
            full_name = " ".join(
                part
                for part in (
                    author.get("given-names"),
                    author.get("family-names"),
                )
                if part
            )
            if full_name:
                extracted_cff_triples.add(
                    (orcid, SDO.name, Literal(full_name))
                )
            extracted_cff_triples.add(
                (orcid, MD4I.orcidId, Literal(orcid))
            )
            affiliation = author.get("affiliation")
            if affiliation:
                extracted_cff_triples.add(
                    (orcid, SDO.affiliation, Literal(affiliation))
                )
            extracted_cff_triples.add((orcid, RDF.type, SDO.Person))
        return extracted_cff_triples


def doi_to_url(doi: str) -> str:
    """Formats a doi to an https URL to doi.org.

    Parameters
    ----------
    doi
        doi where the scheme (e.g. https://) and
        hostname (e.g. doi.org) may be missing.

    Returns
    -------
    str
        doi formatted as a valid url. Base url
        is set to https://doi.org when missing.

    Raises
    ------
    ValueError
        If the input does not contain a valid DOI.

    Examples
    --------
    >>> doi_to_url("10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    >>> doi_to_url("https://doi.org/10.0000/example.abcd")
    'https://doi.org/10.0000/example.abcd'
    """

    doi_match = extract_doi_match(doi)

    if doi_match is None:
        raise ValueError(f"Not a valid DOI: {doi}")

    return f"https://doi.org/{doi_match}"
104 | 105 | Examples 106 | -------- 107 | >>> doi_to_url("10.0000/example.abcd") 108 | 'https://doi.org/10.0000/example.abcd' 109 | >>> doi_to_url("doi.org/10.0000/example.abcd") 110 | 'https://doi.org/10.0000/example.abcd' 111 | >>> doi_to_url("https://doi.org/10.0000/example.abcd") 112 | 'https://doi.org/10.0000/example.abcd' 113 | """ 114 | 115 | doi_match = extract_doi_match(doi) 116 | 117 | if doi_match is None: 118 | raise ValueError(f"Not a valid DOI: {doi}") 119 | 120 | return f"https://doi.org/{doi_match}" 121 | 122 | 123 | def get_cff_doi(data: bytes) -> Optional[list[str]]: 124 | """Given a CFF file, returns a list of DOIs, if any. 125 | 126 | Parameters 127 | ---------- 128 | data 129 | The cff file body as bytes. 130 | 131 | Returns 132 | ------- 133 | list of str, optional 134 | DOIs formatted as valid URLs 135 | 136 | Examples 137 | -------- 138 | >>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.1234\\n - type: doi\\n value: 10.5281/zenodo.5678", encoding="utf8")) 139 | ['https://doi.org/10.5281/zenodo.1234', 'https://doi.org/10.5281/zenodo.5678'] 140 | >>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.9012", encoding="utf8")) 141 | ['https://doi.org/10.5281/zenodo.9012'] 142 | >>> get_cff_doi(bytes("abc: def", encoding="utf8")) 143 | """ 144 | 145 | try: 146 | cff = yaml.safe_load(data.decode()) 147 | except yaml.scanner.ScannerError: 148 | logger.warning("cannot read CITATION.cff, skipped.") 149 | return None 150 | 151 | doi_urls = [] 152 | 153 | try: 154 | identifiers = cff["identifiers"] 155 | except (KeyError, TypeError): 156 | logger.warning( 157 | "CITATION.cff does not contain a valid 'identifiers' key." 
158 | ) 159 | return None 160 | 161 | for identifier in identifiers: 162 | if identifier.get("type") == "doi": 163 | try: 164 | doi_url = doi_to_url(identifier["value"]) 165 | doi_urls.append(doi_url) 166 | except ValueError as err: 167 | logger.warning(err) 168 | 169 | return doi_urls or None 170 | 171 | 172 | def get_cff_authors(data: bytes) -> Optional[List[dict[str, str]]]: 173 | """Given a CFF file, returns a list of dictionaries containing orcid, affiliation, first and last names of authors, if any. 174 | 175 | Parameters 176 | ---------- 177 | data 178 | The cff file body as bytes. 179 | 180 | Returns 181 | ------- 182 | list(dict), optional 183 | orcid, names strings of authors 184 | 185 | """ 186 | 187 | try: 188 | cff = yaml.safe_load(data.decode()) 189 | except yaml.scanner.ScannerError: 190 | logger.warning("cannot read CITATION.cff, skipped.") 191 | return None 192 | 193 | authors = [] 194 | try: 195 | for author in cff["authors"]: 196 | author_dict = { 197 | "family-names": author.get("family-names", ""), 198 | "given-names": author.get("given-names", ""), 199 | "orcid": author.get("orcid", ""), 200 | "affiliation": author.get("affiliation", ""), 201 | } 202 | authors.append(author_dict) 203 | except KeyError: 204 | logger.warning("CITATION.cff does not contain an 'authors' key.") 205 | return None 206 | 207 | return authors if authors else None 208 | -------------------------------------------------------------------------------- /gimie/utils/text_processing.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from functools import reduce 3 | import re 4 | from typing import ( 5 | Dict, 6 | Iterable, 7 | List, 8 | Literal, 9 | Optional, 10 | Tuple, 11 | ) 12 | 13 | import numpy as np 14 | from pydantic import BaseModel, Field 15 | from pydantic.dataclasses import dataclass 16 | import scipy.sparse as sp 17 | 18 | 19 | def tokenize(text: str, sep: str = " ") -> List[str]: 20 | 
"""Basic tokenizer. Removes punctuation, but not stop words. 21 | 22 | Parameters 23 | ---------- 24 | text: 25 | Text to tokenize. 26 | sep: 27 | Token separator. 28 | 29 | Examples 30 | -------- 31 | >>> tokenize("Is this a test? Yes it is.") 32 | ['is', 'this', 'a', 'test', 'yes', 'it', 'is'] 33 | """ 34 | text = text.lower() 35 | text = re.sub(r"[\.|,|;|:|!|?|\n]", "", text) 36 | return text.split(sep) 37 | 38 | 39 | def extract_ngrams(tokens: List[str], size: int = 1) -> List[str]: 40 | """Extract ngrams from a list of tokens. 41 | 42 | Parameters 43 | ---------- 44 | tokens: 45 | List of tokens. 46 | size: 47 | Size of ngrams to extract. 48 | 49 | Examples 50 | -------- 51 | >>> extract_ngrams(["this", "is", "a", "test"], size=2) 52 | ['this is', 'is a', 'a test'] 53 | """ 54 | return [ 55 | " ".join(tokens[i : i + size]) 56 | for i in range(0, len(tokens) - size + 1) 57 | ] 58 | 59 | 60 | def get_ngram_counts( 61 | doc: str, ngram_range: Tuple[int, int] = (1, 1) 62 | ) -> Counter[str]: 63 | """Get ngram counts for a document. The ngram range is inclusive. 64 | 65 | Parameters 66 | ---------- 67 | doc: 68 | Document to extract ngrams from. 69 | ngram_range: 70 | Inclusive range of ngram sizes to extract. 71 | 72 | Examples 73 | -------- 74 | >>> get_ngram_counts("Red roses red.", ngram_range=(1, 2)) 75 | Counter({'red': 2, 'roses': 1, 'red roses': 1, 'roses red': 1}) 76 | """ 77 | ngram_counts: Counter[str] = Counter() 78 | tokens = tokenize(doc) 79 | for size in range(ngram_range[0], ngram_range[1] + 1): 80 | ngram_counts += Counter(extract_ngrams(tokens, size)) 81 | return ngram_counts 82 | 83 | 84 | def normalize_csr_rows(X: sp.csr_matrix, norm: str = "l1") -> sp.csr_matrix: 85 | """Normalize rows of a CSR matrix in place. 86 | 87 | Parameters 88 | ---------- 89 | X: 90 | CSR matrix to normalize. 91 | norm: 92 | Norm to use for normalization. Either "l1" or "l2". 
93 | 94 | Examples 95 | -------- 96 | >>> X = sp.csr_matrix([[1, 2], [3, 4]], dtype=np.float64) 97 | >>> normalize_csr_rows(X, norm="l1").toarray() 98 | array([[0.33333333, 0.66666667], 99 | [0.42857143, 0.57142857]]) 100 | >>> normalize_csr_rows(X, norm="l2").toarray() 101 | array([[0.4472136 , 0.89442719], 102 | [0.6 , 0.8 ]]) 103 | """ 104 | norm_func = { 105 | "l1": lambda x: np.abs(x).sum(), 106 | "l2": lambda x: np.sqrt((x**2).sum()), 107 | }[norm] 108 | 109 | for i in range(X.shape[0]): 110 | if X[i].sum() == 0.0: 111 | continue 112 | 113 | X[i, :] /= norm_func(X[i].data) 114 | return X 115 | 116 | 117 | @dataclass 118 | class TfidfConfig: 119 | """Configuration for TfidfVectorizer. 120 | 121 | For more information on tf-idf, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html 122 | 123 | Parameters 124 | ---------- 125 | max_features: 126 | Maximum number of features to keep. If None, all features are kept. 127 | ngram_range: 128 | Inclusive range of ngram sizes to extract. 129 | smooth_idf: 130 | Smooth idf weights by adding a constant 1 to the numerator and denominator 131 | of the idf as if an extra document was seen containing every term once, 132 | preventing zero divisions. 133 | vocabulary: 134 | Vocabulary to use. If None, the vocabulary is inferred from the data. 135 | norm: 136 | Normalization to use for the tfidf matrix. Either "l1" or "l2". 137 | sublinear_tf: 138 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 139 | """ 140 | 141 | max_features: Optional[int] = None 142 | ngram_range: Tuple[int, int] = (1, 1) 143 | smooth_idf: bool = True 144 | vocabulary: Optional[Dict[str, int]] = None 145 | norm: Optional[Literal["l1", "l2"]] = None 146 | sublinear_tf: bool = False 147 | 148 | 149 | class TfidfVectorizer(BaseModel): 150 | r"""A simple term frequency-inverse document frequency (tf-idf) vectorizer 151 | that can be loaded from and serialized to JSON. 
152 | 153 | This implementation replicates the behavior of scikit-learn's (as of 1.3.2), 154 | but only supports a subset of its parameters. 155 | 156 | For more information on tf-idf, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html 157 | 158 | Parameters 159 | ---------- 160 | config: 161 | Configuration for the vectorizer. 162 | idf_vector: 163 | Precomputed idf vector. If None, it is computed from the data. 164 | vocabulary: 165 | Vocabulary to use. If None, the vocabulary is inferred from the data. 166 | 167 | Examples 168 | -------- 169 | >>> docs = ["The quick brown fox", "jumps over", "the lazy dog."] 170 | >>> vectorizer = TfidfVectorizer(config=TfidfConfig()) 171 | >>> tfidf = vectorizer.fit_transform(docs) 172 | >>> tfidf.shape 173 | (3, 8) 174 | """ 175 | 176 | config: TfidfConfig 177 | idf_vector: List[float] = list() 178 | vocabulary: Dict[str, int] = Field(default_factory=dict) 179 | 180 | def _get_idf_vector( 181 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 182 | ) -> List[float]: 183 | """Compute the idf vector for the whole corpus from a list of 184 | ngram counts from each document. 185 | 186 | Parameters 187 | ---------- 188 | ngram_counts: 189 | List of ngram counts for each document. 190 | vocab: 191 | Vocabulary to use. Each ngram key has an integer value used as the 192 | column index of the output matrix. 
193 | """ 194 | idf_vector = np.zeros(len(vocab), dtype=np.float64) 195 | for record in ngram_counts: 196 | idf_vector[[vocab[t] for t in record.keys() if t in vocab]] += 1 197 | n_docs = len(ngram_counts) + int(self.config.smooth_idf) 198 | idf_vector += int(self.config.smooth_idf) 199 | idf_vector = 1 + np.log(n_docs / (idf_vector)) 200 | return list(idf_vector) 201 | 202 | def _get_tf_matrix( 203 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 204 | ) -> sp.csr_matrix: 205 | """Compute the term frequency matrix for the whole corpus from a 206 | list of ngram counts from each document. 207 | 208 | Parameters 209 | ---------- 210 | ngram_counts: 211 | List of ngram counts for each document (rows of the output matrix). 212 | vocab: 213 | Vocabulary to use. Each ngram key has an integer value used as the 214 | column index of the output matrix. 215 | """ 216 | tf_matrix = sp.lil_matrix( 217 | (len(ngram_counts), len(vocab)), dtype=np.float64 218 | ) 219 | for idx, record in enumerate(ngram_counts): 220 | pairs = record.items() 221 | counts = [v for _, v in pairs] 222 | tf_matrix[idx, [vocab[t] for t, _ in pairs]] = [c for c in counts] 223 | tf_matrix = tf_matrix.tocsr() 224 | if self.config.sublinear_tf: 225 | # applies log in place 226 | np.log(tf_matrix.data, tf_matrix.data) # type: ignore 227 | tf_matrix.data += 1 # type: ignore 228 | return tf_matrix 229 | 230 | def _get_tfidf( 231 | self, ngram_counts: List[Counter[str]], vocab: Dict[str, int] 232 | ) -> sp.csr_matrix: 233 | """Compute the tfidf matrix over the whole corpus from a list of 234 | ngram counts from each document. 235 | 236 | Parameters 237 | ---------- 238 | ngram_counts: 239 | List of ngram counts for each document. 240 | vocab: 241 | Vocabulary to use. Each ngram key has an integer value used as the 242 | column index of the output matrix. 
243 | """ 244 | tf_matrix: sp.csr_matrix = self._get_tf_matrix( 245 | ngram_counts, vocab=vocab 246 | ) 247 | 248 | tfidf_matrix = tf_matrix.multiply(np.array(self.idf_vector)) # type: ignore 249 | return tfidf_matrix.tocsr() # type: ignore 250 | 251 | def _get_vocabulary( 252 | self, ngram_counts: Iterable[Counter[str]] 253 | ) -> dict[str, int]: 254 | """Get the vocabulary from a list of ngram counts. The vocabulary 255 | is a mapping from ngrams to integer used as column indices in the 256 | tfidf matrix. 257 | 258 | Parameters 259 | ---------- 260 | ngram_counts: 261 | List of ngram counts for each document. 262 | """ 263 | counts_corpus = reduce(lambda x, y: x | y, ngram_counts).most_common() 264 | if self.config.max_features is not None: 265 | counts_corpus = counts_corpus[: self.config.max_features] 266 | return { 267 | t[0]: i 268 | for i, t in enumerate(sorted(counts_corpus, key=lambda x: x[0])) 269 | } 270 | 271 | def fit(self, data: Iterable[str]): 272 | """Fit the vectorizer to a list of documents. 273 | 274 | Parameters 275 | ---------- 276 | data: 277 | List of documents contents to fit the vectorizer to.""" 278 | counts_records: List[Counter[str]] = [ 279 | get_ngram_counts(doc, self.config.ngram_range) for doc in data 280 | ] 281 | vocab = self.config.vocabulary or self._get_vocabulary(counts_records) 282 | self.idf_vector = self._get_idf_vector(counts_records, vocab=vocab) 283 | self.vocabulary = vocab 284 | 285 | def transform(self, data: Iterable[str]) -> sp.csr_matrix: 286 | """Transform a list of documents into a tfidf matrix. 287 | The model must be fit before calling this method. 288 | 289 | Parameters 290 | ---------- 291 | data: 292 | List of documents contents to transform. 293 | """ 294 | if not self.vocabulary: 295 | raise ValueError("Vocabulary is empty. 
Call `fit` first.") 296 | counts_records = [ 297 | get_ngram_counts(doc, self.config.ngram_range) for doc in data 298 | ] 299 | counts_records = [ 300 | Counter({k: v for k, v in doc.items() if k in self.vocabulary}) 301 | for doc in counts_records 302 | ] 303 | tfidf = self._get_tfidf(counts_records, vocab=self.vocabulary) 304 | if self.config.norm is not None: 305 | return normalize_csr_rows(tfidf, norm=self.config.norm) 306 | return tfidf 307 | 308 | def fit_transform(self, data: Iterable[str]) -> sp.csr_matrix: 309 | """Fit the vectorizer to a list of documents and transform them 310 | into a tfidf matrix. 311 | 312 | Parameters 313 | ---------- 314 | data: 315 | List of documents contents to fit the vectorizer to and transform. 316 | """ 317 | self.fit(list(data)) 318 | return self.transform(data) 319 | -------------------------------------------------------------------------------- /gimie/extractors/gitlab.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | from __future__ import annotations 18 | from dataclasses import dataclass 19 | import os 20 | import requests 21 | from datetime import datetime 22 | from dateutil.parser import isoparse 23 | from functools import cached_property 24 | from typing import Any, Dict, List, Optional, Union 25 | from urllib.parse import urlparse 26 | from dotenv import load_dotenv 27 | from gimie.io import RemoteResource 28 | from gimie.models import ( 29 | Organization, 30 | Person, 31 | Repository, 32 | ) 33 | from gimie.extractors.abstract import Extractor 34 | from gimie.extractors.common.queries import send_graphql_query, send_rest_query 35 | 36 | load_dotenv() 37 | 38 | 39 | @dataclass 40 | class GitlabExtractor(Extractor): 41 | """Extractor for Gitlab repositories. Uses the Gitlab GraphQL API to 42 | extract metadata into linked data. 43 | url: str 44 | The url of the git repository. 45 | base_url: Optional[str] 46 | The base url of the git remote. 47 | 48 | """ 49 | 50 | url: str 51 | base_url: Optional[str] = None 52 | local_path: Optional[str] = None 53 | 54 | token: Optional[str] = None 55 | 56 | def list_files(self) -> List[RemoteResource]: 57 | """takes the root repository folder and returns the list of files present""" 58 | file_list = [] 59 | file_dict = self._repo_data["repository"]["tree"]["blobs"]["nodes"] 60 | defaultbranchref = self._repo_data["repository"]["rootRef"] 61 | for item in file_dict: 62 | file = RemoteResource( 63 | path=item["name"], 64 | url=f'{self.url}/-/raw/{defaultbranchref}/{item["name"]}', 65 | headers=self._headers, 66 | ) 67 | file_list.append(file) 68 | return file_list 69 | 70 | def extract(self) -> Repository: 71 | """Extract metadata from target Gitlab repository.""" 72 | 73 | # fetch metadata 74 | data = self._repo_data 75 | 76 | # NOTE(identifier): Each Gitlab project has a unique identifier (integer) 77 | # NOTE(author): Fetches only the group directly related to the project 78 | # the group takes the form: parent/subgroup 79 | 80 | 
# NOTE(contributors): contributors = project members 81 | # who are not owners + those that have written merge requests 82 | # owners are either multiple individuals or a group. If no user 83 | # is marked as owner, contributors are project members or merge 84 | # request authors 85 | repo_meta = dict( 86 | authors=self._safe_extract_author(data), 87 | contributors=self._safe_extract_contributors(data), 88 | date_created=isoparse(data["createdAt"][:-1]), 89 | date_modified=isoparse(data["lastActivityAt"][:-1]), 90 | description=data["description"], 91 | identifier=urlparse(data["id"]).path.split("/")[2], 92 | keywords=data["topics"], 93 | name=self.path, 94 | prog_langs=[lang["name"] for lang in data["languages"]], 95 | url=self.url, 96 | ) 97 | 98 | if data["releases"]["edges"]: 99 | repo_meta["date_published"] = isoparse( 100 | data["releases"]["edges"][0]["node"]["releasedAt"] 101 | ) 102 | 103 | if data["releases"] and (len(data["releases"]["edges"]) > 0): 104 | # go into releases and take the name from the first node (most recent) 105 | version = data["releases"]["edges"][0]["node"]["name"] 106 | repo_meta["version"] = version 107 | repo_meta[ 108 | "download_url" 109 | ] = f"{self.url}/-/archive/{version}/{self.path.split('/')[-1]}-{version}.tar.gz" 110 | return Repository(**repo_meta) # type: ignore 111 | 112 | def _safe_extract_author( 113 | self, repo: Dict[str, Any] 114 | ) -> List[Union[Person, Organization]]: 115 | """Extract the author from a GraphQL repository node. 
116 | projectMembers is used if available, otherwise the author 117 | is inferred from the project url.""" 118 | members = repo["projectMembers"]["edges"] 119 | if len(members) > 0: 120 | owners = filter( 121 | lambda m: m["node"]["accessLevel"]["stringValue"] == "OWNER", 122 | members, 123 | ) 124 | return [ 125 | self._get_author(owner["node"]["user"]) for owner in owners 126 | ] 127 | 128 | if repo["group"] is not None: 129 | return [self._get_author(repo["group"])] 130 | 131 | # If the author is absent from the GraphQL response (permission bug), 132 | # fallback to the REST API 133 | return [self._user_from_rest(self.path.split("/")[0])] 134 | 135 | def _safe_extract_contributors( 136 | self, repo: dict[str, Any] 137 | ) -> List[Person] | None: 138 | members = [ 139 | user["node"]["user"] 140 | for user in repo["projectMembers"]["edges"] 141 | if user["node"]["accessLevel"]["stringValue"] != "OWNER" 142 | ] 143 | merge_request_authors = [ 144 | author["node"]["author"] 145 | for author in repo["mergeRequests"]["edges"] 146 | ] 147 | contributors = members + merge_request_authors 148 | # Drop duplicate (unhashable) dicts by "id" key 149 | uniq_contrib = list({c["id"]: c for c in contributors}.values()) 150 | return [self._get_user(contrib) for contrib in uniq_contrib] 151 | 152 | @cached_property 153 | def _repo_data(self) -> Dict[str, Any]: 154 | """Fetch repository metadata from GraphQL endpoint.""" 155 | data = {"path": self.path} 156 | project_query = """ 157 | query project_query($path: ID!) 
{ 158 | project(fullPath: $path) { 159 | name 160 | id 161 | description 162 | createdAt 163 | lastActivityAt 164 | group { 165 | id 166 | name 167 | description 168 | avatarUrl 169 | webUrl 170 | } 171 | languages { 172 | name 173 | share 174 | } 175 | topics 176 | projectMembers { 177 | edges { 178 | node { 179 | id 180 | accessLevel { 181 | stringValue 182 | } 183 | user { 184 | id 185 | name 186 | username 187 | publicEmail 188 | webUrl 189 | } 190 | } 191 | } 192 | } 193 | mergeRequests{ 194 | edges { 195 | node { 196 | author { 197 | id 198 | name 199 | username 200 | publicEmail 201 | webUrl 202 | } 203 | } 204 | } 205 | } 206 | repository { 207 | rootRef 208 | tree{ 209 | blobs{ 210 | nodes { 211 | name 212 | webUrl 213 | } 214 | } 215 | } 216 | } 217 | releases { 218 | edges { 219 | node { 220 | name 221 | releasedAt 222 | } 223 | } 224 | } 225 | } 226 | } 227 | """ 228 | response = send_graphql_query( 229 | self.graphql_endpoint, project_query, data, self._headers 230 | ) 231 | if "errors" in response: 232 | raise ValueError(response["errors"]) 233 | 234 | return response["data"]["project"] 235 | 236 | @cached_property 237 | def _headers(self) -> Any: 238 | """Set authentication headers for Gitlab API requests.""" 239 | try: 240 | if not self.token: 241 | self.token = os.environ.get("GITLAB_TOKEN") 242 | assert self.token 243 | headers = {"Authorization": f"token {self.token}"} 244 | 245 | login = requests.get(f"{self.rest_endpoint}/user", headers=headers) 246 | assert login.json().get("login") 247 | except AssertionError: 248 | return {} 249 | else: 250 | return headers 251 | 252 | def _get_author(self, node: Dict[str, Any]) -> Union[Organization, Person]: 253 | """Given the GraphQL node for a repository owner, 254 | return the author as a Person or Organization object.""" 255 | # Is this the best test? 
256 | if "username" in node: 257 | return self._get_user(node) 258 | return self._get_organization(node) 259 | 260 | def _get_organization(self, node: Dict[str, Any]) -> Organization: 261 | """Extract details from a GraphQL organization node.""" 262 | return Organization( 263 | _id=node["webUrl"], 264 | name=node["name"], 265 | description=node.get("description"), 266 | logo=node.get("avatarUrl"), 267 | ) 268 | 269 | def _get_user(self, node: Dict[str, Any]) -> Person: 270 | """Extract details from a GraphQL user node.""" 271 | return Person( 272 | _id=node["webUrl"], 273 | identifier=node["username"], 274 | name=node.get("name"), 275 | email=node.get("publicEmail"), 276 | ) 277 | 278 | def _user_from_rest(self, username: str) -> Person: 279 | """Given a username, use the REST API to retrieve the Person object.""" 280 | 281 | author = send_rest_query( 282 | self.rest_endpoint, 283 | f"/users?username={username}", 284 | self._headers, 285 | ) 286 | if isinstance(author, list): 287 | author = author[0] 288 | 289 | return Person( 290 | _id=author["web_url"], 291 | identifier=author["username"], 292 | name=author.get("name"), 293 | ) 294 | 295 | @property 296 | def rest_endpoint(self) -> str: 297 | return f"{self.base}/api/v4/" 298 | 299 | @property 300 | def graphql_endpoint(self) -> str: 301 | return f"{self.base}/api" 302 | -------------------------------------------------------------------------------- /gimie/extractors/github.py: -------------------------------------------------------------------------------- 1 | # Gimie 2 | # Copyright 2022 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | from __future__ import annotations 18 | 19 | from dataclasses import dataclass 20 | from dateutil.parser import isoparse 21 | from functools import cached_property 22 | import os 23 | import requests 24 | from typing import Any, Dict, List, Optional, Union 25 | from urllib.parse import urlparse 26 | from dotenv import load_dotenv 27 | 28 | from gimie.extractors.abstract import Extractor 29 | from gimie.models import ( 30 | Organization, 31 | Person, 32 | Repository, 33 | ) 34 | 35 | from gimie.io import RemoteResource 36 | from gimie.extractors.common.queries import ( 37 | send_rest_query, 38 | send_graphql_query, 39 | ) 40 | 41 | GH_API = "https://api.github.com" 42 | load_dotenv() 43 | 44 | 45 | def query_contributors( 46 | url: str, headers: Dict[str, str] 47 | ) -> List[Dict[str, Any]]: 48 | """Queries the list of contributors of target repository 49 | using GitHub's REST and GraphQL APIs. Returns a list of GraphQL User nodes. 50 | NOTE: This is a workaround for the lack of a contributors field in the GraphQL API. 51 | """ 52 | owner, name = urlparse(url).path.strip("/").split("/") 53 | # Get contributors (available in the REST API but not GraphQL) 54 | data = f"repos/{owner}/{name}/contributors" 55 | contributors = send_rest_query(GH_API, data, headers=headers) 56 | ids = [contributor["node_id"] for contributor in contributors] 57 | # Get all contributors' metadata in 1 GraphQL query 58 | users_query = """ 59 | query users($ids: [ID!]!) { 60 | nodes(ids: $ids) { 61 | ... 
on User { 62 | avatarUrl 63 | company 64 | login 65 | name 66 | organizations(first: 100) { 67 | nodes { 68 | avatarUrl 69 | description 70 | login 71 | name 72 | url 73 | } 74 | } 75 | url 76 | } 77 | } 78 | }""" 79 | 80 | contributors = send_graphql_query( 81 | GH_API, users_query, data={"ids": ids}, headers=headers 82 | ) 83 | # Drop empty users (e.g. dependabot) 84 | return [user for user in contributors["data"]["nodes"] if user] 85 | 86 | 87 | @dataclass 88 | class GithubExtractor(Extractor): 89 | """Extractor for GitHub repositories. Uses the GitHub GraphQL API to 90 | extract metadata into linked data. 91 | url: str 92 | The url of the git repository. 93 | base_url: Optional[str] 94 | The base url of the git remote. 95 | """ 96 | 97 | url: str 98 | base_url: Optional[str] = None 99 | local_path: Optional[str] = None 100 | 101 | token: Optional[str] = None 102 | 103 | def list_files(self) -> List[RemoteResource]: 104 | """takes the root repository folder and returns the list of files present""" 105 | file_list = [] 106 | file_dict = self._repo_data["object"]["entries"] 107 | repo_url = self._repo_data["url"] 108 | defaultbranchref = self._repo_data["defaultBranchRef"]["name"] 109 | 110 | for item in file_dict: 111 | file = RemoteResource( 112 | path=item["name"], 113 | url=f'{repo_url}/raw/{defaultbranchref}/{item["path"]}', 114 | headers=self._headers, 115 | ) 116 | file_list.append(file) 117 | return file_list 118 | 119 | def extract(self) -> Repository: 120 | """Extract metadata from target GitHub repository.""" 121 | data = self._repo_data 122 | 123 | repo_meta = dict( 124 | authors=[self._get_author(data["owner"])], 125 | contributors=self._fetch_contributors(), 126 | date_created=isoparse(data["createdAt"][:-1]), 127 | date_modified=isoparse(data["updatedAt"][:-1]), 128 | description=data["description"], 129 | name=self.path, 130 | keywords=self._get_keywords(*data["repositoryTopics"]["nodes"]), 131 | url=self.url, 132 | ) 133 | if data["parent"]: 134 | 
repo_meta["parent_repository"] = data["parent"]["url"] 135 | 136 | if data["latestRelease"]: 137 | repo_meta["date_published"] = isoparse( 138 | data["latestRelease"]["publishedAt"] 139 | ) 140 | 141 | if data["primaryLanguage"] is not None: 142 | repo_meta["prog_langs"] = [data["primaryLanguage"]["name"]] 143 | 144 | if data["latestRelease"]: 145 | version = data["latestRelease"]["name"] 146 | download_url = f"{self.url}/archive/refs/tags/{version}.tar.gz" 147 | repo_meta["download_url"] = download_url 148 | repo_meta["version"] = version 149 | 150 | return Repository(**repo_meta) # type: ignore 151 | 152 | @cached_property 153 | def _repo_data(self) -> Dict[str, Any]: 154 | """Repository metadata fetched from GraphQL endpoint.""" 155 | owner, name = self.path.split("/") 156 | data = {"owner": owner, "name": name} 157 | repo_query = """ 158 | query repo($owner: String!, $name: String!) { 159 | repository(name: $name, owner: $owner) { 160 | url 161 | parent {url} 162 | createdAt 163 | description 164 | latestRelease { 165 | publishedAt 166 | name 167 | } 168 | defaultBranchRef { 169 | name 170 | } 171 | object(expression: "HEAD:") { 172 | ... on Tree { 173 | 174 | entries { 175 | name 176 | path 177 | } 178 | } 179 | } 180 | mentionableUsers(first: 100) { 181 | nodes { 182 | login 183 | name 184 | avatarUrl 185 | company 186 | organizations(first: 100) { 187 | nodes { 188 | avatarUrl 189 | description 190 | login 191 | name 192 | url 193 | } 194 | } 195 | url 196 | } 197 | } 198 | name 199 | owner { 200 | avatarUrl 201 | login 202 | url 203 | ... on User { 204 | company 205 | name 206 | organizations(first: 100) { 207 | nodes { 208 | avatarUrl 209 | description 210 | login 211 | name 212 | url 213 | } 214 | } 215 | } 216 | ... 
on Organization { 217 | name 218 | description 219 | } 220 | } 221 | primaryLanguage { 222 | name 223 | } 224 | repositoryTopics(first: 10) { 225 | nodes { 226 | topic { 227 | name 228 | } 229 | } 230 | } 231 | updatedAt 232 | url 233 | } 234 | } 235 | """ 236 | response = send_graphql_query(GH_API, repo_query, data, self._headers) 237 | 238 | if "errors" in response: 239 | raise ValueError(response["errors"]) 240 | 241 | return response["data"]["repository"] 242 | 243 | def _fetch_contributors(self) -> List[Person]: 244 | """Queries the GitHub GraphQL API to extract contributors through the commit list. 245 | NOTE: This is a workaround for the lack of a contributors field in the GraphQL API. 246 | """ 247 | contributors = [] 248 | resp = query_contributors(self.url, self._headers) 249 | for user in resp: 250 | contributors.append(self._get_user(user)) 251 | return list(contributors) 252 | 253 | @cached_property 254 | def _headers(self) -> Any: 255 | """Set authentication headers for GitHub API requests.""" 256 | try: 257 | if not self.token: 258 | self.token = os.environ.get("GITHUB_TOKEN") 259 | if not self.token: 260 | raise ValueError( 261 | "GitHub token not found. Please set the GITHUB_TOKEN environment variable " 262 | "with your GitHub personal access token." 263 | ) 264 | headers = {"Authorization": f"token {self.token}"} 265 | 266 | login = requests.get(f"{GH_API}/user", headers=headers) 267 | if not login.ok or not login.json().get("login"): 268 | raise ValueError( 269 | "GitHub authentication failed. Please check that your GITHUB_TOKEN is valid." 
270 | ) 271 | return headers 272 | except requests.exceptions.RequestException as e: 273 | raise ConnectionError(f"Failed to connect to GitHub API: {str(e)}") 274 | 275 | def _get_keywords(self, *nodes: Dict[str, Any]) -> List[str]: 276 | """Extract names from GraphQL topic nodes.""" 277 | return [node["topic"]["name"] for node in nodes] 278 | 279 | def _get_organization(self, node: Dict[str, Any]) -> Organization: 280 | """Extract details from a GraphQL organization node.""" 281 | return Organization( 282 | _id=node["url"], 283 | name=node["login"], 284 | description=node["description"], 285 | legal_name=node["name"], 286 | logo=node["avatarUrl"], 287 | ) 288 | 289 | def _get_author(self, node: Dict[str, Any]) -> Union[Organization, Person]: 290 | """Given the GraphQL node for a repository owner, 291 | return the author as a Person or Organization object.""" 292 | 293 | if "organizations" in node: 294 | return self._get_user(node) 295 | 296 | return self._get_organization(node) 297 | 298 | def _get_user(self, node: Dict[str, Any]) -> Person: 299 | """Extract details from a GraphQL user node.""" 300 | # Get user's affiliations 301 | orgs = [ 302 | self._get_organization(org) 303 | for org in node["organizations"]["nodes"] 304 | ] 305 | return Person( 306 | _id=node["url"], 307 | identifier=node["login"], 308 | name=node["name"], 309 | affiliations=orgs, 310 | ) 311 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /gimie/parsers/license/data/tfidf_vectorizer.json: -------------------------------------------------------------------------------- 1 | 
{"config":{"max_features":700,"ngram_range":[1,2],"smooth_idf":true,"vocabulary":null,"norm":"l2","sublinear_tf":true},"idf_vector":[1.2724146003209924,1.3457458734065422,5.174387269895637,2.535329940280379,2.535329940280379,4.481240089335692,4.768922161787472,4.258096538021482,5.174387269895637,2.7764919970972666,4.768922161787472,2.7764919970972666,2.6486586255873816,4.768922161787472,5.174387269895637,3.921624301400269,4.258096538021482,5.174387269895637,2.203972804325936,2.203972804325936,1.4608152031913293,1.4367176516122688,2.0173868487455238,3.5649493574615367,4.258096538021482,4.768922161787472,4.768922161787472,2.3121863889661687,5.174387269895637,2.2840155119994723,5.174387269895637,4.768922161787472,1.9555114450274362,2.7320402345264325,2.2299482907291965,3.7880929087757464,2.341173925839421,2.3710268889891024,3.921624301400269,1.0472528848505456,3.921624301400269,2.6486586255873816,1.550046336919272,3.921624301400269,4.481240089335692,1.1670540846631663,3.159484249353372,2.689480620107637,2.6094379124341005,3.228477120840324,2.203972804325936,1.4855078157817008,2.3121863889661687,2.7764919970972666,1.807091439909163,2.083344816537321,3.3826278006675823,3.3826278006675823,3.3025850929940455,2.038893053966487,1.6190392084062235,1.149035579160488,1.6190392084062235,1.303186258987746,1.023347363996991,2.083344816537321,1.6628418310646165,1.789997006549863,1.3566749439387324,1.3788980807234426,1.0312525435041044,2.401798547655856,2.466337068793427,4.768922161787472,3.0343211063993665,1.8244831826210324,5.174387269895637,1.4855078157817008,2.0173868487455238,2.6094379124341005,2.535329940280379,2.083344816537321,2.689480620107637,4.258096538021482,4.258096538021482,4.258096538021482,1.550046336919272,1.916290731874155,1.5108256237659907,1.1854032233313627,2.2299482907291965,3.3826278006675823,1.252413933614323,1.5368011101692514,1.498086597988561,2.6486586255873816,4.258096538021482,4.481240089335692,1.590868331439527,2.689480620107637,4.481240089335692,1.8244
831826210324,2.0608719606852626,1.0635134057223259,1.6931471805599454,5.174387269895637,2.3121863889661687,5.174387269895637,1.2328054622259468,1.131336002061087,4.258096538021482,2.923095471289142,2.3121863889661687,5.174387269895637,1.252413933614323,1.916290731874155,2.923095471289142,4.481240089335692,1.8421827597204332,1.7731898882334818,4.481240089335692,4.481240089335692,2.1063343347620203,2.3121863889661687,1.7404000654104907,1.7086513670959107,1.604854573414267,4.768922161787472,1.2231435513142097,2.178654996341646,4.768922161787472,2.1539623837512747,2.466337068793427,2.203972804325936,2.0608719606852626,3.469639177657212,2.1539623837512747,2.689480620107637,3.228477120840324,1.7566605862822713,4.075774981227527,4.481240089335692,3.3025850929940455,4.258096538021482,3.469639177657212,5.174387269895637,2.7764919970972666,2.8230120127321596,4.481240089335692,4.481240089335692,1.3136575588550414,2.571697584451253,2.401798547655856,1.9555114450274362,2.535329940280379,2.923095471289142,1.7566605862822713,3.094945728215801,3.670309873119363,3.3025850929940455,3.5649493574615367,1.424883193965266,1.6480267452794757,1.2135741002980591,1.3566749439387324,1.0968498259899175,2.3121863889661687,2.8718021769015913,2.6486586255873816,3.670309873119363,1.9555114450274362,3.228477120840324,4.075774981227527,4.481240089335692,3.921624301400269,1.8421827597204332,3.469639177657212,5.174387269895637,3.469639177657212,4.481240089335692,1.149035579160488,1.550046336919272,5.174387269895637,5.174387269895637,4.258096538021482,4.258096538021482,3.670309873119363,1.7731898882334818,2.689480620107637,1.8421827597204332,1.677879708429157,5.174387269895637,4.258096538021482,4.481240089335692,2.7764919970972666,3.3025850929940455,3.7880929087757464,4.075774981227527,3.921624301400269,1.7566605862822713,1.1947056159936762,2.9771626925594177,1.6931471805599454,1.5770750093071912,2.8230120127321596,1.3788980807234426,1.7086513670959107,2.8718021769015913,1.4486938426589844,1.6778797084
29157,1.633427945858323,1.633427945858323,4.481240089335692,3.670309873119363,5.174387269895637,4.481240089335692,4.481240089335692,4.481240089335692,3.469639177657212,1.424883193965266,2.433547245970436,1.5237290286018985,3.3826278006675823,1.807091439909163,4.481240089335692,3.0343211063993665,5.174387269895637,5.174387269895637,4.481240089335692,4.481240089335692,1.5237290286018985,2.0173868487455238,4.481240089335692,5.174387269895637,2.689480620107637,2.7320402345264325,1.807091439909163,3.670309873119363,1.633427945858323,4.481240089335692,3.0343211063993665,1.8244831826210324,1.6480267452794757,1.252413933614323,4.258096538021482,4.481240089335692,1.0392207131532814,1.149035579160488,1.0717439048588413,1.5368011101692514,1.252413933614323,2.401798547655856,2.571697584451253,2.401798547655856,5.174387269895637,5.174387269895637,5.174387269895637,1.6480267452794757,2.6486586255873816,2.8718021769015913,1.1226023220923322,1.5770750093071912,2.1298648321722142,2.6094379124341005,2.401798547655856,2.8230120127321596,3.3826278006675823,2.8230120127321596,4.075774981227527,3.469639177657212,2.038893053966487,5.174387269895637,2.401798547655856,5.174387269895637,2.923095471289142,5.174387269895637,3.0343211063993665,1.6190392084062235,1.2928234719521994,1.4608152031913293,1.4608152031913293,2.571697584451253,1.4486938426589844,1.9963334395476915,2.2840155119994723,1.3788980807234426,1.1580042491432483,1.916290731874155,1.677879708429157,1.0472528848505456,1.590868331439527,1.2825669717850103,3.094945728215801,1.149035579160488,1.590868331439527,1.5368011101692514,1.0717439048588413,1.677879708429157,2.571697584451253,3.7880929087757464,3.5649493574615367,5.174387269895637,1.9757141523449557,1.9757141523449557,2.3121863889661687,1.023347363996991,2.1063343347620203,1.8602012652231115,1.3788980807234426,1.8244831826210324,4.768922161787472,1.2928234719521994,2.2840155119994723,4.258096538021482,4.481240089335692,2.2299482907291965,5.174387269895637,1.5634693572514127,3
.3826278006675823,4.481240089335692,4.481240089335692,1.633427945858323,3.469639177657212,3.228477120840324,3.228477120840324,1.3677247801253174,3.469639177657212,2.8718021769015913,3.921624301400269,4.768922161787472,3.921624301400269,3.7880929087757464,3.7880929087757464,3.670309873119363,4.481240089335692,1.1854032233313627,2.083344816537321,2.7320402345264325,1.807091439909163,1.677879708429157,1.916290731874155,1.604854573414267,2.083344816537321,1.8421827597204332,5.174387269895637,5.174387269895637,4.768922161787472,5.174387269895637,2.3710268889891024,1.9757141523449557,2.256616537811358,3.0343211063993665,2.571697584451253,1.4730852957831435,4.481240089335692,1.3242396681855786,1.4367176516122688,2.0173868487455238,2.1298648321722142,5.174387269895637,5.174387269895637,1.390197635977376,1.7086513670959107,2.401798547655856,2.466337068793427,5.174387269895637,1.2425616371713113,1.550046336919272,3.5649493574615367,2.6486586255873816,1.5634693572514127,2.178654996341646,1.7404000654104907,1.390197635977376,1.4486938426589844,2.8230120127321596,1.5770750093071912,4.481240089335692,2.466337068793427,1.262364264467491,4.481240089335692,1.6480267452794757,5.174387269895637,5.174387269895637,1.2425616371713113,1.8972425369034605,5.174387269895637,2.689480620107637,1.789997006549863,1.055350095083165,5.174387269895637,5.174387269895637,1.0635134057223259,1.2135741002980591,1.4486938426589844,2.0173868487455238,2.1063343347620203,1.8244831826210324,5.174387269895637,1.0312525435041044,1.6628418310646165,1.2825669717850103,2.8718021769015913,4.768922161787472,4.481240089335692,5.174387269895637,1.3457458734065422,1.8785504038913081,1.055350095083165,1.1053605156578263,1.5770750093071912,1.1761865682264387,2.6486586255873816,1.8244831826210324,1.789997006549863,1.498086597988561,2.1063343347620203,2.401798547655856,1.0392207131532814,1.807091439909163,2.923095471289142,1.498086597988561,4.075774981227527,1.8244831826210324,1.550046336919272,1.5770750093071912,2.871802
1769015913,1.4855078157817008,3.159484249353372,5.174387269895637,3.921624301400269,2.8718021769015913,5.174387269895637,5.174387269895637,1.1139442593492177,1.3349349573023266,4.481240089335692,4.481240089335692,1.3136575588550414,2.7320402345264325,3.921624301400269,3.670309873119363,4.481240089335692,3.3025850929940455,1.5237290286018985,5.174387269895637,5.174387269895637,1.6931471805599454,3.7880929087757464,1.1139442593492177,1.8421827597204332,1.590868331439527,4.481240089335692,1.5237290286018985,1.7404000654104907,1.6190392084062235,2.923095471289142,1.7566605862822713,1.8972425369034605,4.768922161787472,2.5002386204691085,5.174387269895637,2.1298648321722142,1.8602012652231115,3.7880929087757464,1.8785504038913081,2.178654996341646,2.7764919970972666,2.401798547655856,2.1539623837512747,3.921624301400269,1.7566605862822713,2.341173925839421,1.7566605862822713,1.055350095083165,1.390197635977376,1.9357088177312565,4.481240089335692,4.481240089335692,5.174387269895637,1.5237290286018985,2.0173868487455238,2.038893053966487,4.768922161787472,4.768922161787472,3.469639177657212,4.481240089335692,4.258096538021482,5.174387269895637,2.571697584451253,2.0173868487455238,2.083344816537321,3.7880929087757464,2.433547245970436,1.6628418310646165,2.8230120127321596,2.6486586255873816,1.7243997240640498,1.8602012652231115,2.038893053966487,2.1298648321722142,1.916290731874155,1.4608152031913293,1.262364264467491,5.174387269895637,4.768922161787472,5.174387269895637,2.923095471289142,3.3826278006675823,3.921624301400269,1.6931471805599454,2.0608719606852626,3.3826278006675823,2.203972804325936,2.401798547655856,1.1580042491432483,1.6480267452794757,3.670309873119363,1.789997006549863,1.4486938426589844,1.0472528848505456,2.0173868487455238,2.7764919970972666,1.677879708429157,1.6190392084062235,1.1580042491432483,3.670309873119363,2.401798547655856,1.3566749439387324,3.228477120840324,2.7320402345264325,2.9771626925594177,4.075774981227527,2.3121863889661687,1.4248831
93965266,2.535329940280379,5.174387269895637,1.4367176516122688,1.9963334395476915,4.258096538021482,5.174387269895637,1.1761865682264387,2.341173925839421,3.7880929087757464,5.174387269895637,3.670309873119363,5.174387269895637,5.174387269895637,5.174387269895637,5.174387269895637,2.923095471289142,3.0343211063993665,4.481240089335692,2.0608719606852626,1.424883193965266,1.7731898882334818,1.5368011101692514,2.0608719606852626,1.6628418310646165,1.088410957344053,1.8785504038913081,2.5002386204691085,5.174387269895637,1.252413933614323,3.670309873119363,1.5770750093071912,1.0312525435041044,3.5649493574615367,3.469639177657212,2.535329940280379,3.921624301400269,2.2299482907291965,2.5002386204691085,1.590868331439527,3.159484249353372,2.571697584451253,4.481240089335692,4.768922161787472,4.481240089335692,1.2724146003209924,4.258096538021482,5.174387269895637,2.8718021769015913,2.923095471289142,5.174387269895637,2.8230120127321596,3.670309873119363,4.258096538021482,4.258096538021482,1.9757141523449557,2.689480620107637,3.670309873119363,2.5002386204691085,2.341173925839421,2.535329940280379,2.8230120127321596,4.258096538021482,1.604854573414267,4.075774981227527,4.768922161787472,2.7320402345264325,3.3025850929940455,2.6094379124341005,4.768922161787472,1.7566605862822713,1.604854573414267,4.075774981227527,2.8718021769015913,5.174387269895637,5.174387269895637,1.4608152031913293,2.535329940280379,2.178654996341646,1.8421827597204332,2.1298648321722142,3.0343211063993665,2.0608719606852626,1.8785504038913081,1.7731898882334818,1.9555114450274362,1.0312525435041044,3.0343211063993665,4.075774981227527,1.424883193965266,3.7880929087757464,1.5368011101692514,1.6480267452794757,1.0312525435041044,3.670309873119363,1.9963334395476915,1.390197635977376,2.3121863889661687,1.0717439048588413,1.633427945858323,1.2328054622259468,4.481240089335692,1.9555114450274362,4.258096538021482,1.2825669717850103,2.2299482907291965,1.4367176516122688,1.4730852957831435,4.481240089335
692,3.7880929087757464,2.256616537811358,2.6486586255873816,2.2299482907291965,1.0312525435041044,1.3242396681855786,1.3457458734065422,2.1298648321722142,1.916290731874155,2.2840155119994723,3.5649493574615367,1.7404000654104907,2.9771626925594177,1.303186258987746,1.7086513670959107,1.7243997240640498,1.9757141523449557,1.1053605156578263,1.3677247801253174,2.8230120127321596,3.094945728215801,1.7404000654104907,1.2328054622259468,1.4016263318009987,2.535329940280379,1.8785504038913081,2.1539623837512747,1.6190392084062235,1.0968498259899175,2.178654996341646,2.1539623837512747,1.1854032233313627,1.8972425369034605,1.0635134057223259,1.6190392084062235,1.590868331439527,5.174387269895637,2.083344816537321,3.159484249353372,2.433547245970436,2.2299482907291965,2.466337068793427,1.550046336919272,1.9555114450274362,1.390197635977376,3.670309873119363,1.9555114450274362,1.916290731874155,1.5634693572514127,1.7404000654104907,1.5634693572514127,4.768922161787472,3.3025850929940455,4.768922161787472,5.174387269895637,4.258096538021482,4.481240089335692],"vocabulary":{"":0," ":1," %":2," (a)":3," (b)":4," *":5," **":6," -":7," article":8," if":9," means":10," this":11," you":12,"\"third":13,"\"third party\"":14,"\"work":15,"\"work that":16,"%":17,"(a)":18,"(b)":19,"(c)":20,"(including":21,"(or":22,"(or any":23,"*":24,"**":25,"** ":26,"-":27,"- the":28,"1":29,"1 of":30,"16b1":31,"2":32,"22":33,"3":34,"3 of":35,"30":36,"6":37,"6 of":38,"a":39,"a \"work":40,"a contributor":41,"a copy":42,"a covered":43,"a derived":44,"a particular":45,"a product":46,"a program":47,"a recipient":48,"a subsequent":49,"a work":50,"above":51,"access":52,"access to":53,"action":54,"additional":55,"affero":56,"affero general":57,"agency":58,"agree":59,"agreement":60,"all":61,"also":62,"an":63,"and":64,"and ":65,"and all":66,"and conditions":67,"and the":68,"and/or":69,"any":70,"any contributor":71,"any derivative":72,"any extensions":73,"any modifications":74,"any of":75,"any or":76,"any 
other":77,"any person":78,"any portion":79,"any subsequent":80,"any such":81,"any third":82,"apple":83,"apple and":84,"apple's":85,"applicable":86,"application":87,"apply":88,"are":89,"are not":90,"article":91,"as":92,"as a":93,"at":94,"attribution":95,"attribution information":96,"au":97,"available":98,"b":99,"base":100,"based":101,"based on":102,"be":103,"been":104,"beopen":105,"both":106,"bull":107,"but":108,"by":109,"by apple":110,"by licensor":111,"by such":112,"by sybase":113,"by the":114,"by this":115,"c":116,"ca":117,"can":118,"case":119,"case may":120,"cern":121,"change":122,"changes":123,"charge":124,"claim":125,"claims":126,"cnri":127,"code":128,"code and":129,"code base":130,"code form":131,"code is":132,"code of":133,"code or":134,"code version":135,"combination":136,"combined":137,"combined work":138,"commercial":139,"commercial contributor":140,"commercial distributor":141,"communicate":142,"company":143,"compatible":144,"compatible source":145,"compiled":146,"component":147,"component of":148,"concédant":149,"conditions":150,"contribution":151,"contributions":152,"contributor":153,"contributor and":154,"contributor version":155,"contributors":156,"convey":157,"convey a":158,"convey the":159,"conveying":160,"copies":161,"copies of":162,"copy":163,"copy of":164,"copyright":165,"copyright holder":166,"corporation":167,"corresponding":168,"corresponding source":169,"covered":170,"covered code":171,"covered software":172,"covered source":173,"covered work":174,"create":175,"current":176,"current maintainer":177,"d":178,"d'auteur":179,"damages":180,"data":181,"data files":182,"datagrid":183,"de":184,"de la":185,"deploy":186,"derivative":187,"derivative work":188,"derivative works":189,"derived":190,"derived program":191,"derived work":192,"des":193,"description":194,"developer":195,"developer and":196,"developer original":197,"digital":198,"display":199,"distribute":200,"distribute or":201,"distribute the":202,"distributed":203,"distributed 
by":204,"distribution":205,"distribution of":206,"distributor":207,"do":208,"do not":209,"does":210,"does not":211,"doit":212,"downstream":213,"downstream distribution":214,"droit":215,"du":216,"du logiciel":217,"e":218,"each":219,"each contributor":220,"either":221,"en":222,"entity":223,"est":224,"et":225,"eu":226,"eu datagrid":227,"european":228,"european union":229,"except":230,"executable":231,"executable code":232,"executable distribution":233,"exhibit":234,"exhibit a":235,"expressly":236,"extensions":237,"extent":238,"externally":239,"federal":240,"file":241,"files":242,"following":243,"font":244,"font software":245,"for":246,"for a":247,"for any":248,"for the":249,"form":250,"forth":251,"forth in":252,"foundation":253,"frameworx":254,"frameworx code":255,"frameworx company":256,"free":257,"free software":258,"freedom":259,"from":260,"from the":261,"general":262,"general public":263,"give":264,"gnu":265,"gnu affero":266,"gnu general":267,"gnu gpl":268,"gnu lesser":269,"governed":270,"governed code":271,"governing":272,"governing jurisdiction":273,"government":274,"government agency":275,"gpl":276,"grant":277,"granted":278,"has":279,"have":280,"having":281,"hereby":282,"hereunder":283,"holder":284,"however":285,"if":286,"if the":287,"if you":288,"in":289,"in a":290,"in any":291,"in part":292,"in the":293,"in this":294,"include":295,"including":296,"information":297,"initial":298,"initial contributor":299,"initial developer":300,"initial work":301,"intellectual":302,"intellectual property":303,"interface":304,"is":305,"is a":306,"is not":307,"it":308,"it is":309,"items":310,"its":311,"jurisdiction":312,"la":313,"la licence":314,"larger":315,"latex":316,"law":317,"le":318,"le concédant":319,"le logiciel":320,"legal":321,"les":322,"lesser":323,"lesser general":324,"liability":325,"libraries":326,"library":327,"library and":328,"library general":329,"library is":330,"library or":331,"library\"":332,"licence":333,"licencié":334,"license":335,"license ":336,"license 
agreement":337,"license and":338,"license is":339,"license shall":340,"license to":341,"license you":342,"licensed":343,"licensed product":344,"licensed program":345,"licensed software":346,"licensed work":347,"licensee":348,"licenses":349,"licensor":350,"licensor and":351,"licensor or":352,"limitation":353,"logiciel":354,"loss":355,"made":356,"made available":357,"made by":358,"maintainer":359,"maintainer of":360,"make":361,"making":362,"material":363,"matter":364,"matter of":365,"may":366,"may be":367,"may convey":368,"mean":369,"means":370,"means any":371,"means the":372,"modification":373,"modifications":374,"modifications made":375,"modified":376,"modified covered":377,"modified version":378,"modify":379,"module":380,"more":381,"mulan":382,"mulan psl":383,"must":384,"must be":385,"nethack":386,"network":387,"new":388,"no":389,"nokia":390,"non-profit":391,"not":392,"notice":393,"notices":394,"object":395,"object code":396,"obligations":397,"oclc":398,"of":399,"of a":400,"of any":401,"of covered":402,"of exhibit":403,"of licensed":404,"of nethack":405,"of such":406,"of that":407,"of the":408,"of this":409,"offer":410,"on":411,"on a":412,"on the":413,"one":414,"only":415,"open":416,"open source":417,"or":418,"or a":419,"or all":420,"or any":421,"or communicate":422,"or in":423,"or other":424,"or otherwise":425,"ordinary":426,"original":427,"original code":428,"original developer":429,"original software":430,"original work":431,"osl":432,"osl 30":433,"other":434,"otherwise":435,"otherwise using":436,"ou":437,"out":438,"over":439,"over the":440,"package":441,"par":442,"paragraphs":443,"part":444,"part 1":445,"part 6":446,"part of":447,"participant":448,"particular":449,"parties":450,"party":451,"party\"":452,"patent":453,"patent license":454,"permission":455,"permissions":456,"permitted":457,"person":458,"php":459,"portion":460,"portion thereof)":461,"portions":462,"product":463,"product or":464,"products":465,"program":466,"program is":467,"program 
or":468,"programs":469,"propagate":470,"property":471,"property rights":472,"provide":473,"provided":474,"provided that":475,"provisions":476,"présente":477,"présente licence":478,"psl":479,"public":480,"public license":481,"publicly":482,"python":483,"python 16b1":484,"que":485,"qui":486,"québec":487,"realnetworks":488,"receive":489,"received":490,"recipient":491,"recipient may":492,"recipient's":493,"recipients":494,"redistributions":495,"reproduction":496,"required":497,"requirements":498,"respect":499,"respect to":500,"resulting":501,"right":502,"rights":503,"rights over":504,"rpl":505,"rsv":506,"run":507,"said":508,"secondary":509,"section":510,"sections":511,"server":512,"set":513,"set forth":514,"shall":515,"shall be":516,"shall mean":517,"shall not":518,"so":519,"software":520,"software and":521,"software foundation":522,"software is":523,"software or":524,"source":525,"source ":526,"source and":527,"source code":528,"source license":529,"source or":530,"standard":531,"standard version":532,"states":533,"subject":534,"subject matter":535,"subject software":536,"subject to":537,"subsequent":538,"subsequent contributor":539,"subsequent work":540,"such":541,"such contributor":542,"such recipient":543,"such subsequent":544,"supplement":545,"supplement file":546,"sybase":547,"sybase and":548,"sybase or":549,"system":550,"systems":551,"termes":552,"termination":553,"terms":554,"terms and":555,"terms of":556,"text":557,"than":558,"that":559,"that is":560,"that it":561,"that subsequent":562,"that the":563,"that uses":564,"that you":565,"the":566,"the ":567,"the agreement":568,"the case":569,"the combined":570,"the conditions":571,"the contributor":572,"the copyright":573,"the corresponding":574,"the covered":575,"the current":576,"the derived":577,"the european":578,"the following":579,"the font":580,"the frameworx":581,"the free":582,"the gnu":583,"the governed":584,"the initial":585,"the library":586,"the library\"":587,"the licence":588,"the license":589,"the 
licensed":590,"the licensee":591,"the licensor":592,"the modified":593,"the notice":594,"the object":595,"the ordinary":596,"the original":597,"the package":598,"the php":599,"the program":600,"the provisions":601,"the recipient":602,"the rpl":603,"the software":604,"the source":605,"the standard":606,"the subject":607,"the subsequent":608,"the supplement":609,"the terms":610,"the work":611,"them":612,"then":613,"thereof":614,"thereof)":615,"these":616,"they":617,"third":618,"third party":619,"this":620,"this agreement":621,"this licence":622,"this license":623,"this package":624,"those":625,"time":626,"to":627,"to ":628,"to a":629,"to any":630,"to copy":631,"to the":632,"to this":633,"to use":634,"toute":635,"trademarks":636,"un":637,"under":638,"under a":639,"under the":640,"under this":641,"une":642,"union":643,"united":644,"united states":645,"upon":646,"use":647,"use of":648,"used":649,"user":650,"users":651,"uses":652,"uses the":653,"using":654,"v":655,"version":656,"version of":657,"versions":658,"versions of":659,"warranties":660,"warranty":661,"we":662,"web":663,"where":664,"whether":665,"which":666,"which you":667,"who":668,"whole":669,"will":670,"with":671,"with a":672,"with respect":673,"with the":674,"within":675,"without":676,"without limitation":677,"work":678,"work (or":679,"work and":680,"work based":681,"work in":682,"work is":683,"work or":684,"works":685,"would":686,"you":687,"you convey":688,"you distribute":689,"you have":690,"you may":691,"you must":692,"your":693,"your extensions":694,"your work":695,"zope":696,"zope corporation":697,"à":698,"être":699}} 2 | --------------------------------------------------------------------------------