├── .copier-answers.yml ├── .git-blame-ignore-revs ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTORS ├── LICENSE ├── Makefile ├── Makefile.local ├── README.rst ├── codemeta.json ├── conftest.py ├── docs ├── .gitignore ├── Makefile ├── Makefile.local ├── _static │ └── .placeholder ├── _templates │ └── .placeholder ├── cli.rst ├── conf.py ├── images │ ├── .gitignore │ ├── Makefile │ ├── metadata-flow.dot │ ├── tasks-extrinsic-metadata-indexers.uml │ └── tasks-intrinsic-metadata-indexers.uml ├── index.rst ├── metadata-workflow.rst └── swhpkg.rst ├── pyproject.toml ├── requirements-swh.txt ├── requirements-test.txt ├── requirements.txt ├── sql ├── bin │ ├── db-upgrade │ └── dot_add_content ├── doc │ ├── json │ └── sql └── json │ ├── .gitignore │ ├── Makefile │ ├── indexer_configuration.tool_configuration.schema.json │ └── revision_metadata.translated_metadata.json ├── swh └── indexer │ ├── __init__.py │ ├── bibtex.py │ ├── cli.py │ ├── codemeta.py │ ├── data │ ├── Gitea.csv │ ├── codemeta │ │ ├── CITATION │ │ ├── LICENSE │ │ ├── codemeta-2.0.jsonld │ │ ├── codemeta-3.0.jsonld │ │ └── crosswalk.csv │ ├── composer.csv │ ├── nuget.csv │ ├── pubspec.csv │ └── schema.org │ │ ├── CITATION │ │ ├── LICENSE │ │ └── schemaorgcontext.jsonld │ ├── fossology_license.py │ ├── indexer.py │ ├── metadata.py │ ├── metadata_detector.py │ ├── metadata_dictionary │ ├── __init__.py │ ├── base.py │ ├── cff.py │ ├── codemeta.py │ ├── composer.py │ ├── dart.py │ ├── gitea.py │ ├── github.py │ ├── maven.py │ ├── npm.py │ ├── nuget.py │ ├── python.py │ ├── ruby.py │ └── utils.py │ ├── mimetype.py │ ├── namespaces.py │ ├── origin_head.py │ ├── py.typed │ ├── rehash.py │ ├── storage │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── client.py │ │ ├── serializers.py │ │ └── server.py │ ├── converters.py │ ├── db.py │ ├── exc.py │ ├── in_memory.py │ ├── interface.py │ ├── metrics.py │ ├── model.py │ ├── sql │ │ ├── 10-superuser-init.sql │ │ ├── 
20-enums.sql │ │ ├── 30-schema.sql │ │ ├── 50-data.sql │ │ ├── 50-func.sql │ │ ├── 60-indexes.sql │ │ └── upgrades │ │ │ ├── 115.sql │ │ │ ├── 116.sql │ │ │ ├── 117.sql │ │ │ ├── 118.sql │ │ │ ├── 119.sql │ │ │ ├── 120.sql │ │ │ ├── 121.sql │ │ │ ├── 122.sql │ │ │ ├── 123.sql │ │ │ ├── 124.sql │ │ │ ├── 125.sql │ │ │ ├── 126.sql │ │ │ ├── 127.sql │ │ │ ├── 128.sql │ │ │ ├── 129.sql │ │ │ ├── 130.sql │ │ │ ├── 131.sql │ │ │ ├── 132.sql │ │ │ ├── 133.sql │ │ │ ├── 134.sql │ │ │ ├── 135.sql │ │ │ ├── 136.sql │ │ │ └── 137.sql │ └── writer.py │ └── tests │ ├── __init__.py │ ├── conftest.py │ ├── metadata_dictionary │ ├── __init__.py │ ├── test_cff.py │ ├── test_codemeta.py │ ├── test_composer.py │ ├── test_dart.py │ ├── test_gitea.py │ ├── test_github.py │ ├── test_maven.py │ ├── test_npm.py │ ├── test_nuget.py │ ├── test_python.py │ └── test_ruby.py │ ├── storage │ ├── __init__.py │ ├── conftest.py │ ├── generate_data_test.py │ ├── test_api_client.py │ ├── test_converters.py │ ├── test_in_memory.py │ ├── test_metrics.py │ ├── test_model.py │ ├── test_server.py │ └── test_storage.py │ ├── test_bibtex.py │ ├── test_cli.py │ ├── test_codemeta.py │ ├── test_fossology_license.py │ ├── test_indexer.py │ ├── test_metadata.py │ ├── test_mimetype.py │ ├── test_origin_head.py │ ├── test_origin_metadata.py │ └── utils.py └── tox.ini /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: v0.3.3 3 | _src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git 4 | description: Software Heritage indexer 5 | distribution_name: swh-indexer 6 | have_cli: true 7 | have_workers: true 8 | package_root: swh/indexer 9 | project_name: swh.indexer 10 | python_minimal_version: '3.7' 11 | readme_format: rst 12 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: 
-------------------------------------------------------------------------------- 1 | # python: Reformat code with black 2 | 5aa97ccd6ce29d6f66eb093c5d06e9030d7449fd 3 | 0f847f6119195649fe4108b776b9244940ebdb46 4 | 2e9f1d3e896062ae6b3cd99dc1a5d4148beebbf7 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | *.pyc 3 | .coverage 4 | .eggs/ 5 | .hypothesis 6 | .mypy_cache 7 | .tox 8 | __pycache__ 9 | build/ 10 | dist/ 11 | # these are symlinks created by a hook in swh-docs' main sphinx conf.py 12 | docs/README.rst 13 | docs/README.md 14 | # this should be a symlink for people who want to build the sphinx doc 15 | # without using tox, generally created by the swh-env/bin/update script 16 | docs/Makefile.sphinx 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-json 7 | - id: check-yaml 8 | 9 | - repo: https://github.com/python/black 10 | rev: 25.1.0 11 | hooks: 12 | - id: black 13 | 14 | - repo: https://github.com/PyCQA/isort 15 | rev: 6.0.0 16 | hooks: 17 | - id: isort 18 | 19 | - repo: https://github.com/pycqa/flake8 20 | rev: 7.1.1 21 | hooks: 22 | - id: flake8 23 | additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject] 24 | 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.4.1 27 | hooks: 28 | - id: codespell 29 | name: Check source code spelling 30 | args: [-L assertIn] 31 | exclude: ^(swh/indexer/data/) 32 | stages: [pre-commit] 33 | - id: codespell 34 | name: Check commit message spelling 35 | stages: [commit-msg] 36 | 37 | - repo: local 38 | hooks: 39 | - id: mypy 40 | name: mypy 41 | entry: mypy 42 | 
args: [swh] 43 | pass_filenames: false 44 | language: system 45 | types: [python] 46 | - id: twine-check 47 | name: twine check 48 | description: call twine check when pushing an annotated release tag 49 | entry: bash -c "ref=$(git describe) && 50 | [[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] && 51 | (python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true" 52 | pass_filenames: false 53 | stages: [pre-push] 54 | language: python 55 | additional_dependencies: [twine, build] 56 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Copyright (C) 2015-2017 The Software Heritage developers 2 | 3 | See http://www.softwareheritage.org/ for more information. 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Software Heritage Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as Software 6 | Heritage contributors and maintainers pledge to making participation in our 7 | project and our community a harassment-free experience for everyone, regardless 8 | of age, body size, disability, ethnicity, sex characteristics, gender identity 9 | and expression, level of experience, education, socioeconomic status, 10 | nationality, personal appearance, race, religion, or sexual identity and 11 | orientation. 
12 | 13 | ## Our Standards 14 | 15 | Examples of behavior that contributes to creating a positive environment 16 | include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or 27 | advances 28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our Responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies within all project spaces, and it also applies when 50 | an individual is representing the project or its community in public spaces. 51 | Examples of representing a project or community include using an official 52 | project e-mail address, posting via an official social media account, or acting 53 | as an appointed representative at an online or offline event. 
Representation of 54 | a project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting the project team at `conduct@softwareheritage.org`. All 60 | complaints will be reviewed and investigated and will result in a response that 61 | is deemed necessary and appropriate to the circumstances. The project team is 62 | obligated to maintain confidentiality with regard to the reporter of an 63 | incident. Further details of specific enforcement policies may be posted 64 | separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 69 | 70 | ## Attribution 71 | 72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 73 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 74 | 75 | [homepage]: https://www.contributor-covenant.org 76 | 77 | For answers to common questions about this code of conduct, see 78 | https://www.contributor-covenant.org/faq 79 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Kumar Shivendu 2 | Siddharth Ravikumar 3 | Thibault Allançon 4 | Satvik Vemuganti 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile driver for SWH Python modules. DO NOT CHANGE. 
2 | # You can add custom Makefile rules to Makefile.local 3 | 4 | include ../Makefile.python 5 | -include Makefile.local 6 | -------------------------------------------------------------------------------- /Makefile.local: -------------------------------------------------------------------------------- 1 | TESTFLAGS += --hypothesis-profile=fast 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Software Heritage - Indexer 2 | =========================== 3 | 4 | Tools to compute multiple indexes on SWH's raw contents: 5 | 6 | - content: 7 | 8 | - mimetype 9 | - fossology-license 10 | - metadata 11 | 12 | - origin: 13 | 14 | - metadata (intrinsic, using the content indexer; and extrinsic) 15 | 16 | An indexer is in charge of: 17 | 18 | - looking up objects 19 | - extracting information from those objects 20 | - storing that information in the swh-indexer db 21 | 22 | There are multiple indexers working on different object types: 23 | 24 | - content indexer: works with content sha1 hashes 25 | - revision indexer: works with revision sha1 hashes 26 | - origin indexer: works with origin identifiers 27 | 28 | Indexation procedure: 29 | 30 | - receive batch of ids 31 | - retrieve the associated data depending on object type 32 | - compute for that object some index 33 | - store the result to swh's storage 34 | 35 | Current content indexers: 36 | 37 | - mimetype (queue swh_indexer_content_mimetype): detect the encoding 38 | and mimetype 39 | 40 | - fossology-license (queue swh_indexer_fossology_license): compute the 41 | license 42 | 43 | - metadata: translate files from ecosystem-specific formats to JSON-LD 44 | (using schema.org/CodeMeta vocabulary) 45 | 46 | Current origin indexers: 47 | 48 | - metadata: translate files from ecosystem-specific formats to JSON-LD 49 | (using schema.org/CodeMeta and ForgeFed vocabularies) 50 | 
-------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld", 3 | "@type": "SoftwareSourceCode", 4 | "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d", 5 | "description": "Software Heritage Indexer for revisions and contents", 6 | "name": "swh-indexer", 7 | "isPartOf": { 8 | "@type": "SoftwareSourceCode", 9 | "name": "swh-environment", 10 | "identifier": "83e766feafde91242883be1bf369ed3e6865824f" 11 | }, 12 | "codeRepository": "https://forge.softwareheritage.org/diffusion/78/", 13 | "issueTracker": "https://forge.softwareheritage.org/maniphest/", 14 | "license": "https://spdx.org/licenses/GPL-3.0.html", 15 | "version": "0.0.35", 16 | "author": [ 17 | { 18 | "@type": "Organization", 19 | "name": "Software Heritage", 20 | "url": "https://www.softwareheritage.org", 21 | "email": "swh-devel@inria.fr" 22 | } 23 | ], 24 | "developmentStatus": "active", 25 | "keywords": [ 26 | "indexer", 27 | "software", 28 | "mimetype", 29 | "ctags", 30 | "language", 31 | "fossology-license", 32 | "metadata", 33 | "metadata-detector", 34 | "metadata-translator" 35 | ], 36 | "dateCreated":"2017-06-12", 37 | "datePublished":"2017-06-12", 38 | "programmingLanguage": "Python" 39 | } 40 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2025 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from hypothesis import settings 7 | 8 | # define tests profile. 
Full documentation is at: 9 | # https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles 10 | settings.register_profile("fast", max_examples=5, deadline=5000) 11 | settings.register_profile("slow", max_examples=20, deadline=5000) 12 | 13 | # Ignore the following modules because wsgi module fails as no 14 | # configuration file is found (--doctest-modules forces the module 15 | # loading) 16 | collect_ignore = ["swh/indexer/storage/api/wsgi.py"] 17 | 18 | # we use the various swh fixtures 19 | pytest_plugins = [ 20 | "swh.journal.pytest_plugin", 21 | "swh.storage.pytest_plugin", 22 | ] 23 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | apidoc/ 3 | *-stamp 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | -include Makefile.local 2 | include Makefile.sphinx 3 | -------------------------------------------------------------------------------- /docs/Makefile.local: -------------------------------------------------------------------------------- 1 | sphinx/html: images 2 | sphinx/clean: clean-images 3 | assets: images 4 | 5 | images: 6 | make -C images/ 7 | clean-images: 8 | make -C images/ clean 9 | 10 | .PHONY: images clean-images 11 | 12 | -------------------------------------------------------------------------------- /docs/_static/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/docs/_static/.placeholder -------------------------------------------------------------------------------- /docs/_templates/.placeholder: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/docs/_templates/.placeholder -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | .. _swh-indexer-cli: 2 | 3 | Command-line interface 4 | ====================== 5 | 6 | .. click:: swh.indexer.cli:indexer_cli_group 7 | :prog: swh indexer 8 | :nested: full 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | from swh.docs.sphinx.conf import * # NoQA 2 | -------------------------------------------------------------------------------- /docs/images/.gitignore: -------------------------------------------------------------------------------- 1 | *.svg 2 | -------------------------------------------------------------------------------- /docs/images/Makefile: -------------------------------------------------------------------------------- 1 | 2 | UML_DIAGS_SRC = $(wildcard *.uml) 3 | UML_DIAGS = $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC)) 4 | 5 | DOT_DIAGS_SRC = $(wildcard *.dot) 6 | DOT_DIAGS = $(patsubst %.dot,%.svg,$(DOT_DIAGS_SRC)) 7 | 8 | all: $(UML_DIAGS) $(DOT_DIAGS) 9 | 10 | %.svg: %.uml 11 | DISPLAY="" plantuml -tsvg $< 12 | 13 | %.svg: %.dot 14 | dot $< -T svg -o $@ 15 | 16 | clean: 17 | -rm -f $(DEP_GRAPHS) $(UML_DIAGS) $(DOT_DIAGS) 18 | -------------------------------------------------------------------------------- /docs/images/metadata-flow.dot: -------------------------------------------------------------------------------- 1 | digraph metadata_flow { 2 | subgraph cluster_forges { 3 | style=invis; 4 | origin_vcs [label="Version Control Systems\n(Git, SVN, ...)"]; 5 | origin_pm [label="Package Managers\n(NPM, PyPI, Debian, ...)"]; 6 | } 7 | subgraph internet { 8 | rank=same; 9 | deposit_client 
[label="Deposit Clients\n(HAL, IPOL, eLife, Intel, ...)"]; 10 | registries [label="Registries\n(Wikidata, ...)"]; 11 | } 12 | 13 | subgraph cluster_SWH { 14 | label="Software Heritage"; 15 | labeljust="r"; 16 | labelloc="b"; 17 | loader_vcs [label="VCS loader", shape="box"]; 18 | loader_pm [label="PM loader", shape="box"]; 19 | deposit_server [label="Deposit server", shape="box"]; 20 | indexer_extr [label="extrinsic metadata indexer\n(translate to Codemeta)", shape="box"]; 21 | indexer_intr [label="intrinsic metadata indexer\n(translate to Codemeta)", shape="box"]; 22 | registry_fetcher[label="?", style="dashed", shape="box"]; 23 | 24 | storage [label="\nMain Storage\n(swh-storage and\nswh-objstorage)", shape=cylinder]; 25 | remd_storage [label="\nRaw Extrinsic\nMetadata Storage", shape=cylinder]; 26 | indexed_storage [label="\nIndexed\nMetadata Storage\n(search, idx-storage)", shape=cylinder]; 27 | 28 | webapp [label="Web Interface", shape="box"]; 29 | } 30 | 31 | subgraph users { 32 | browser [label="Web Browser", shape="box"] 33 | } 34 | 35 | origin_vcs -> loader_vcs [label="pull"]; 36 | loader_vcs -> storage; 37 | origin_pm -> loader_pm [label="pull"] 38 | loader_pm -> {storage, remd_storage}; 39 | deposit_client -> deposit_server [label="push\n(SWORD + Codemeta)"]; 40 | deposit_server -> {storage, remd_storage}; 41 | 42 | registries -> registry_fetcher -> remd_storage [style="dashed"]; 43 | 44 | storage -> indexer_intr [label="all kinds of\nmetadata formats"]; 45 | indexer_intr -> indexed_storage [label="only Codemeta"]; 46 | remd_storage -> indexer_extr [label="all kinds of\nmetadata formats"]; 47 | indexer_extr-> indexed_storage; 48 | 49 | {storage, remd_storage, indexed_storage} -> webapp; 50 | webapp -> browser [label="search, display,\nBibTeX export,\ndownload, ..."]; 51 | } 52 | -------------------------------------------------------------------------------- /docs/images/tasks-extrinsic-metadata-indexers.uml: 
-------------------------------------------------------------------------------- 1 | @startuml 2 | participant LOADERS as "Metadata Loaders" 3 | participant STORAGE as "Graph Storage" 4 | participant JOURNAL as "Journal" 5 | participant IDX_REM_META as "REM Indexer" 6 | participant IDX_STORAGE as "Indexer Storage" 7 | 8 | activate IDX_STORAGE 9 | activate STORAGE 10 | activate JOURNAL 11 | activate LOADERS 12 | 13 | LOADERS->>STORAGE: new REM (Raw Extrinsic Metadata) object\n for Origin http://example.org/repo.git\nor object swh:1:dir:... 14 | STORAGE->>JOURNAL: new REM object 15 | deactivate LOADERS 16 | 17 | JOURNAL->>IDX_REM_META: run indexers on REM object 18 | activate IDX_REM_META 19 | 20 | IDX_REM_META->>IDX_REM_META: recognize REM object (gitea/github/deposit/...) 21 | 22 | IDX_REM_META->>IDX_REM_META: parse REM object 23 | 24 | alt If the REM object describe an origin 25 | IDX_REM_META->>IDX_STORAGE: origin_extrinsic_metadata_add(id="http://example.org/repo.git", {author: "Jane Doe", ...}) 26 | IDX_STORAGE->>IDX_REM_META: ok 27 | end 28 | 29 | alt If the REM object describe a directory 30 | IDX_REM_META->>IDX_STORAGE: directory_extrinsic_metadata_add(id="swh:1:dir:...", {author: "Jane Doe", ...}) 31 | IDX_STORAGE->>IDX_REM_META: ok 32 | end 33 | 34 | deactivate IDX_REM_META 35 | 36 | 37 | @enduml 38 | -------------------------------------------------------------------------------- /docs/images/tasks-intrinsic-metadata-indexers.uml: -------------------------------------------------------------------------------- 1 | @startuml 2 | participant LOADERS as "Loaders" 3 | participant STORAGE as "Graph Storage" 4 | participant JOURNAL as "Journal" 5 | participant IDX_ORIG_META as "Origin Metadata Indexer" 6 | participant IDX_ORIG_HEAD as "Origin-Head Indexer" 7 | participant IDX_DIR_META as "Directory Metadata Indexer" 8 | participant IDX_CONT_META as "Content Metadata Indexer" 9 | participant IDX_STORAGE as "Indexer Storage" 10 | participant OBJ_STORAGE as 
"Object Storage" 11 | 12 | activate OBJ_STORAGE 13 | activate IDX_STORAGE 14 | activate STORAGE 15 | activate JOURNAL 16 | activate IDX_ORIG_META 17 | 18 | activate LOADERS 19 | 20 | LOADERS->>STORAGE: Repository content 21 | LOADERS->>STORAGE: Origin http://example.org/repo.git\nwas added/revisited 22 | STORAGE->>JOURNAL: Origin http://example.org/repo.git\nwas added/revisited 23 | deactivate LOADERS 24 | 25 | JOURNAL->>IDX_ORIG_META: run indexers on origin\nhttp://example.org/repo.git 26 | 27 | IDX_ORIG_META->>IDX_ORIG_HEAD: Find HEAD revision of\nhttp://example.org/repo.git 28 | activate IDX_ORIG_HEAD 29 | 30 | IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin="http://example.org/repo.git") 31 | 32 | STORAGE->>IDX_ORIG_HEAD: branches 33 | 34 | IDX_ORIG_HEAD->>IDX_ORIG_META: run Revision Metadata Indexer\non revision 42abcdef (head of origin\nhttp://example.org/repo.git) 35 | deactivate IDX_ORIG_HEAD 36 | 37 | IDX_ORIG_META->>STORAGE: revision_get(sha1=42abcdef) 38 | STORAGE->>IDX_ORIG_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...} 39 | 40 | IDX_ORIG_META->>IDX_DIR_META: Index directory 456789ab\n(head of origin http://example.org/repo.git) 41 | activate IDX_DIR_META 42 | 43 | IDX_DIR_META->>STORAGE: directory_ls(sha1=456789ab) 44 | STORAGE->>IDX_DIR_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...] 45 | 46 | IDX_DIR_META->>IDX_DIR_META: package.json is a metadata file 47 | 48 | IDX_DIR_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe) 49 | IDX_STORAGE->>IDX_DIR_META: none / {author: "Jane Doe", ...} 50 | 51 | alt If the storage answered "none" 52 | IDX_DIR_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file 53 | activate IDX_CONT_META 54 | 55 | IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe 56 | 57 | OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...' 
58 | 59 | IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author 60 | 61 | IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...}) 62 | IDX_STORAGE->>IDX_CONT_META: ok 63 | 64 | IDX_CONT_META->>IDX_DIR_META: extracted: {author: "Jane Doe", ...} 65 | deactivate IDX_CONT_META 66 | 67 | IDX_DIR_META->>IDX_STORAGE: directory_metadata_add(sha1=456789ab, {author: "Jane Doe", ...}) 68 | IDX_STORAGE->>IDX_DIR_META: ok 69 | end 70 | 71 | IDX_DIR_META->>IDX_ORIG_META: extracted: {author: "Jane Doe", ...} 72 | deactivate IDX_DIR_META 73 | 74 | IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id="http://example.org/repo.git", {author: "Jane Doe", ...}, from_directory=456789ab) 75 | IDX_STORAGE->>IDX_ORIG_META: ok 76 | deactivate IDX_ORIG_META 77 | 78 | 79 | @enduml 80 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _swh-indexer: 2 | 3 | .. include:: README.rst 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | :caption: Contents: 8 | 9 | README.md 10 | metadata-workflow.rst 11 | swhpkg.rst 12 | 13 | 14 | Reference Documentation 15 | ----------------------- 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | cli 21 | 22 | .. only:: standalone_package_doc 23 | 24 | Indices and tables 25 | ------------------ 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /docs/swhpkg.rst: -------------------------------------------------------------------------------- 1 | SwhPkg Vocabulary 2 | ================================ 3 | 4 | .. note:: This is an early draft and hasn't been implemented yet 5 | 6 | 7 | SwhPkg is a vocabulary that complements ontologies like schema.org and CodeMeta 8 | in describing software projects. 
While the latter are meant to describe 9 | source code projects, SwhPkg describes relationships between different packages released 10 | by such projects. 11 | 12 | The namespace is ``https://www.softwareheritage.org/schema/2023/packages/``; 13 | and it is meant to be used primarily alongside CodeMeta/schema.org 14 | and ForgeFed/ActivityStreams. 15 | 16 | 17 | The following prefixes are used throughout this document for readability: 18 | 19 | .. code-block:: json 20 | 21 | { 22 | "schema": "http://schema.org/", 23 | "codemeta": "https://codemeta.github.io/terms/", 24 | "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", 25 | "swhpackages": "https://archive.softwareheritage.org/packages/", 26 | } 27 | 28 | For example, here is a document using all three together: 29 | 30 | .. code-block:: json 31 | 32 | { 33 | "@context": { 34 | "schema": "http://schema.org/", 35 | "codemeta": "https://codemeta.github.io/terms/", 36 | "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", 37 | "swhpackages": "https://archive.softwareheritage.org/packages/", 38 | "package": {"@id": "swhpkg:package", "@type": "@id"}, 39 | "release": {"@id": "swhpkg:release", "@type": "@id"}, 40 | "dependencies": {"@id": "swhpkg:dependencies"}, 41 | "dependency": {"@id": "swhpkg:dependency", "@type": "@id"}, 42 | "dependent": {"@id": "swhpkg:dependent", "@type": "@id"}, 43 | "kind": {"@id": "swhpkg:kind"}, 44 | "optional": {"@id": "swhpkg:optional"} 45 | }, 46 | "@type": "schema:SoftwareSourceCode", 47 | "@id": "https://npmjs.com/package/d3@7.8.2", 48 | "package": "swhpackages:js/d3", 49 | "release": "swhpackages:js/d3@7.8.2", 50 | "schema:name": "d3", 51 | "schema:version": "7.8.2", 52 | "schema:description": "Data-Driven Documents", 53 | "dependencies": [ 54 | { 55 | "@type": "swhpkg:dependencies", 56 | "@id": "swhpackages:js/d3@7.8.2#d3-array", 57 | "dependent": "swhpackages:js/d3@7.8.2", 58 | "dependency": "swhpackages:js/d3-array", 59 | "constraint": "^3.0.0", 60 | 
"kind": "runtime", 61 | "optional": false 62 | }, 63 | { 64 | "@type": "swhpkg:dependencies", 65 | "@id": "swhpackages:js/d3@7.8.2#mocha", 66 | "dependent": "swhpackages:js/d3@7.8.2", 67 | "dependency": "swhpackages:js/mocha", 68 | "constraint": ">10.0.0", 69 | "kind": "development", 70 | "optional": true 71 | } 72 | ] 73 | } 74 | 75 | SwhPkg Terms 76 | ------------- 77 | 78 | .. list-table:: 79 | :header-rows: 1 80 | 81 | * - Property 82 | - Type 83 | - Examples 84 | - Description 85 | * - ``package`` 86 | - ``swhpkg:package`` 87 | - ``swhpackages:js/d3``, ``swhpackages:python/numpy`` 88 | - Package that is released by the SoftwareSourceCode/SoftwareApplication. 89 | * - ``release`` 90 | - ``swhpkg:release`` 91 | - ``swhpackages:js/d3@7.8.2``, ``swhpackages:python/numpy@1.24.2`` 92 | - Specific version of the package that is released by the SoftwareSourceCode/SoftwareApplication 93 | * - ``dependencies`` 94 | - ``swhpkg:dependencies`` 95 | - d3 depends on d3-array and mocha. 96 | - Dependencies of the project. There can be many of them. 97 | * - ``dependent`` 98 | - ``swhpkg:release`` 99 | - ``swhpkg:js/d3`` 100 | - A reference to the package release that depends on the dependency. 101 | * - ``dependency`` 102 | - ``swhpkg:package`` 103 | - ``swhpackages:js/d3``, ``swhpackages:python/django`` 104 | - A reference to the package that is depended on. 105 | * - ``constraint`` 106 | - Text 107 | - ``^3.0.0``, ``>10.0.0`` 108 | - The constraint on a dependency relation. It can be a version range, or a git commit hash, or even a file path. 109 | * - ``kind`` 110 | - Text 111 | - ``runtime``, ``development`` 112 | - The type of dependency relation. Some common values are ``runtime``, ``development``. 113 | * - ``optional`` 114 | - boolean 115 | - ``true``, ``false`` 116 | - Whether the dependency is optional or not. 
117 | 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "swh.indexer" 3 | authors = [ 4 | {name="Software Heritage developers", email="swh-devel@inria.fr"}, 5 | ] 6 | 7 | description = "Software Heritage indexer" 8 | readme = {file = "README.rst", content-type = "text/x-rst"} 9 | requires-python = ">=3.9" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "Intended Audience :: Developers", 13 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 14 | "Operating System :: OS Independent", 15 | "Development Status :: 5 - Production/Stable", 16 | ] 17 | dynamic = ["version", "dependencies", "optional-dependencies"] 18 | 19 | [tool.setuptools.packages.find] 20 | include = ["swh.*"] 21 | 22 | [tool.setuptools.dynamic] 23 | dependencies = {file = ["requirements.txt", "requirements-swh.txt"]} 24 | 25 | [tool.setuptools.dynamic.optional-dependencies] 26 | testing = {file = ["requirements-test.txt"]} 27 | 28 | [project.entry-points."swh.cli.subcommands"] 29 | "swh.indexer" = "swh.indexer.cli" 30 | 31 | [project.entry-points."swh.indexer_storage.classes"] 32 | "postgresql" = "swh.indexer.storage:IndexerStorage" 33 | "remote" = "swh.indexer.storage.api.client:RemoteStorage" 34 | "memory" = "swh.indexer.storage.in_memory:IndexerStorage" 35 | 36 | [project.urls] 37 | "Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer" 38 | "Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer/-/issues" 39 | "Funding" = "https://www.softwareheritage.org/donate" 40 | "Documentation" = "https://docs.softwareheritage.org/devel/swh-indexer/" 41 | "Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer.git" 42 | 43 | [build-system] 44 | requires = ["setuptools", "setuptools-scm"] 45 | build-backend = "setuptools.build_meta" 46 | 47 | 
[tool.setuptools_scm] 48 | fallback_version = "0.0.1" 49 | 50 | [tool.black] 51 | target-version = ['py39', 'py310', 'py311', 'py312'] 52 | 53 | [tool.isort] 54 | multi_line_output = 3 55 | include_trailing_comma = true 56 | force_grid_wrap = 0 57 | use_parentheses = true 58 | ensure_newline_before_comments = true 59 | line_length = 88 60 | force_sort_within_sections = true 61 | known_first_party = ['swh'] 62 | 63 | [tool.mypy] 64 | namespace_packages = true 65 | warn_unused_ignores = true 66 | explicit_package_bases = true 67 | # ^ Needed for mypy to detect py.typed from swh packages installed 68 | # in editable mode 69 | 70 | plugins = [] 71 | 72 | # 3rd party libraries without stubs (yet) 73 | [[tool.mypy.overrides]] 74 | module = [ 75 | "pybtex.*", 76 | "pyld.*", 77 | ] 78 | ignore_missing_imports = true 79 | 80 | [tool.flake8] 81 | select = ["C", "E", "F", "W", "B950"] 82 | ignore = [ 83 | "E203", # whitespaces before ':' 84 | "E231", # missing whitespace after ',' 85 | "E501", # line too long, use B950 warning from flake8-bugbear instead 86 | "W503" # line break before binary operator 87 | ] 88 | max-line-length = 88 89 | 90 | [tool.pytest.ini_options] 91 | norecursedirs = "build docs .*" 92 | asyncio_mode = "strict" 93 | consider_namespace_packages = true 94 | -------------------------------------------------------------------------------- /requirements-swh.txt: -------------------------------------------------------------------------------- 1 | swh.core[db,http] >= 4.0.0 2 | swh.model >= 6.13.0 3 | swh.objstorage >= 2.3.1 4 | swh.storage >= 3.0.0 5 | swh.journal >= 0.1.0 6 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka 2 | hypothesis >= 3.11.0 3 | pytest >= 8.1 4 | pytest-mock 5 | swh.core[testing] >= 3.0.0 6 | swh.journal[pytest] >= 2.0.0 7 | swh.storage[pytest] >= 3.1.0 8 | 9 | types-click 10 | 
types-confluent-kafka 11 | types-pyyaml 12 | types-xmltodict 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-magic >= 0.4.13 2 | click 3 | # frozendict: dependency of pyld 4 | # the version 2.1.2 is causing segmentation faults 5 | # cf https://forge.softwareheritage.org/T3815 6 | frozendict != 2.1.2 7 | iso8601 8 | # use upstream pybtex that removed pkg_resources use until a new release 9 | pybtex @ git+https://bitbucket.org/pybtex-devs/pybtex.git@9b97822 10 | pyld 11 | rdflib >= 7.1.4 # first version with this patch: https://github.com/RDFLib/rdflib/pull/3011 12 | sentry-sdk 13 | typing-extensions 14 | xmltodict 15 | -------------------------------------------------------------------------------- /sql/bin/db-upgrade: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute a draft upgrade script for the DB schema, based on Git revisions. 4 | 5 | # Depends: apgdiff 6 | 7 | set -e 8 | 9 | SQLS="swh-*.sql" 10 | VERSION_SQL="swh-schema.sql" 11 | UPGRADE_DIR="upgrades" 12 | DB_NAME="softwareheritage-dev" 13 | 14 | usage () { 15 | echo "Usage: db-upgrade GIT_REV_FROM [GIT_REV_TO]" 16 | echo "Example: db-upgrade HEAD^" 17 | echo " db-upgrade HEAD~4 HEAD~2" 18 | echo "See also: gitrevisions(7)" 19 | exit 1 20 | } 21 | 22 | pg_dump_revision () { 23 | rev="$1" 24 | dump="$2" 25 | 26 | echo "checking out revision $rev, and dumping DB at the time..." 
27 | if [ "$rev" != "HEAD" ] ; then 28 | git checkout --quiet "$rev" 29 | fi 30 | make distclean filldb > /dev/null 31 | pg_dump "$DB_NAME" > "$dump" 32 | if [ "$rev" != "HEAD" ] ; then 33 | git checkout --quiet - 34 | fi 35 | } 36 | 37 | # argument parsing 38 | if [ -z "$1" ] ; then 39 | usage 40 | fi 41 | from_rev="$1" 42 | shift 1 43 | if [ -z "$1" ] ; then 44 | to_rev="HEAD" 45 | else 46 | to_rev="$1" 47 | shift 1 48 | fi 49 | 50 | old_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) 51 | new_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) 52 | trap "rm -f $old_dump $new_dump" EXIT 53 | 54 | schema_version=$(grep -i -A 1 '^insert into dbversion' "$VERSION_SQL" | tail -n 1 \ 55 | | sed -e 's/.*values(//i' -e 's/,.*//') 56 | upgrade_script=$(mktemp -p "$UPGRADE_DIR" $(printf '%.03d' ${schema_version}).XXXX.sql) 57 | pg_dump_revision "$from_rev" "$old_dump" 58 | pg_dump_revision "$to_rev" "$new_dump" 59 | 60 | cat > "$upgrade_script" <> "$upgrade_script" 71 | 72 | echo "all done." 73 | echo "Draft upgrade script is at: ${upgrade_script}" 74 | -------------------------------------------------------------------------------- /sql/bin/dot_add_content: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOT_FILE="$1" 4 | DOT_EXTRA="$2" 5 | if [ -z "$DOT_FILE" -o -z "$DOT_EXTRA" ] ; then 6 | echo "Usage: $0 DOT_FILE DOT_EXTRA" 7 | exit 1 8 | fi 9 | 10 | schema_version=$(grep -i -A 1 '^insert into dbversion' swh-schema.sql | tail -n 1 \ 11 | | sed -e 's/.*values(//i' -e 's/,.*//') 12 | 13 | head -n -1 "$DOT_FILE" # all of $DOT_FILE but last line 14 | sed "s/@@VERSION@@/$schema_version/" "$DOT_EXTRA" 15 | echo "}" 16 | -------------------------------------------------------------------------------- /sql/doc/json: -------------------------------------------------------------------------------- 1 | ../json -------------------------------------------------------------------------------- /sql/doc/sql: 
-------------------------------------------------------------------------------- 1 | ../autodoc -------------------------------------------------------------------------------- /sql/json/.gitignore: -------------------------------------------------------------------------------- 1 | *-stamp 2 | -------------------------------------------------------------------------------- /sql/json/Makefile: -------------------------------------------------------------------------------- 1 | # Depends: json-glib-tools 2 | 3 | JSONVAL = json-glib-validate 4 | JSONS = $(wildcard *.json) 5 | 6 | all: validate 7 | check: validate 8 | test: validate 9 | 10 | validate: validate-stamp 11 | validate-stamp: $(JSONS) 12 | make $(patsubst %,validate/%,$?) 13 | touch $@ 14 | 15 | validate/%: 16 | $(JSONVAL) $* 17 | 18 | clean: 19 | rm -f validate-stamp 20 | -------------------------------------------------------------------------------- /sql/json/indexer_configuration.tool_configuration.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "id": "http://softwareheritage.org/schemas/indexer_configuration.tool_configuration.schema.json", 4 | 5 | "type": "object", 6 | "properties": { 7 | "command_line": { 8 | "type": "string" 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /sql/json/revision_metadata.translated_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json", 4 | 5 | "type": "object", 6 | "properties": { 7 | "developmentStatus": { 8 | "type": "list" 9 | }, 10 | "version": { 11 | "type": "list" 12 | }, 13 | "operatingSystem": { 14 | "type": "list" 15 | }, 16 | "description": { 17 | "type": "list" 18 | }, 19 | "keywords": { 20 | "type": "list" 
21 | }, 22 | "issueTracker": { 23 | "type": "list" 24 | }, 25 | "name": { 26 | "type": "list" 27 | }, 28 | "author": { 29 | "type": "list" 30 | }, 31 | "relatedLink": { 32 | "type": "list" 33 | }, 34 | "url": { 35 | "type": "list" 36 | }, 37 | "license": { 38 | "type": "list" 39 | }, 40 | "maintainer": { 41 | "type": "list" 42 | }, 43 | "email": { 44 | "type": "list" 45 | }, 46 | "softwareRequirements": { 47 | "type": "list" 48 | }, 49 | "identifier": { 50 | "type": "list" 51 | }, 52 | "codeRepository": { 53 | "type": "list" 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /swh/indexer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | # implemented as a function to help lazy loading 8 | def get_datastore(*args, **kw): 9 | from .indexer import get_indexer_storage 10 | 11 | return get_indexer_storage(*args, **kw) 12 | 13 | 14 | default_cfg = { 15 | "default_interval": "1 day", 16 | "min_interval": "12 hours", 17 | "max_interval": "1 day", 18 | "backoff_factor": 2, 19 | "max_queue_length": 5000, 20 | } 21 | -------------------------------------------------------------------------------- /swh/indexer/data/Gitea.csv: -------------------------------------------------------------------------------- 1 | Property,Gitea 2 | codeRepository,clone_url 3 | programmingLanguage,languages 4 | runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion, 19 | 
storageRequirements, 20 | supportingData, 21 | author,owner 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated,created_at 27 | dateModified,updated_at 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords, 34 | license, 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version, 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,website 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email, 53 | affiliation, 54 | identifier, 55 | name,name 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker, 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/CITATION: -------------------------------------------------------------------------------- 1 | Matthew B. Jones, Carl Boettiger, Abby Cabunoc Mayes, Arfon Smith, Peter Slaughter, Kyle Niemeyer, Yolanda Gil, Martin Fenner, Krzysztof Nowak, Mark Hahnel, Luke Coy, Alice Allen, Mercè Crosas, Ashley Sands, Neil Chue Hong, Patricia Cruse, Daniel S. Katz, Carole Goble. 2017. CodeMeta: an exchange schema for software metadata. Version 2.0. KNB Data Repository. 
doi:10.5063/schema/codemeta-2.0 2 | swh:1:dir:f39a0ef0005ad0dee50dcd546231ed568cf8705d;origin=https://github.com/codemeta/codemeta 3 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/codemeta-2.0.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": { 3 | "type": "@type", 4 | "id": "@id", 5 | "schema":"http://schema.org/", 6 | "codemeta": "https://codemeta.github.io/terms/", 7 | "Organization": {"@id": "schema:Organization"}, 8 | "Person": {"@id": "schema:Person"}, 9 | "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, 10 | "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, 11 | "Text": {"@id": "schema:Text"}, 12 | "URL": {"@id": "schema:URL"}, 13 | "address": { "@id": "schema:address"}, 14 | "affiliation": { "@id": "schema:affiliation"}, 15 | "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, 16 | "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, 17 | "citation": { "@id": "schema:citation"}, 18 | "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, 19 | "contributor": { "@id": "schema:contributor"}, 20 | "copyrightHolder": { "@id": "schema:copyrightHolder"}, 21 | "copyrightYear": { "@id": "schema:copyrightYear"}, 22 | "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date" }, 23 | "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, 24 | "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, 25 | "description": { "@id": "schema:description"}, 26 | "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, 27 | "email": { "@id": "schema:email"}, 28 | "editor": { "@id": "schema:editor"}, 29 | "encoding": { "@id": "schema:encoding"}, 30 | "familyName": { "@id": "schema:familyName"}, 31 | "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, 32 | "fileSize": { "@id": "schema:fileSize"}, 33 | 
"funder": { "@id": "schema:funder"}, 34 | "givenName": { "@id": "schema:givenName"}, 35 | "hasPart": { "@id": "schema:hasPart" }, 36 | "identifier": { "@id": "schema:identifier", "@type": "@id"}, 37 | "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, 38 | "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, 39 | "isPartOf": { "@id": "schema:isPartOf"}, 40 | "keywords": { "@id": "schema:keywords"}, 41 | "license": { "@id": "schema:license", "@type": "@id"}, 42 | "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, 43 | "name": { "@id": "schema:name"}, 44 | "operatingSystem": { "@id": "schema:operatingSystem"}, 45 | "permissions": { "@id": "schema:permissions"}, 46 | "position": { "@id": "schema:position"}, 47 | "processorRequirements": { "@id": "schema:processorRequirements"}, 48 | "producer": { "@id": "schema:producer"}, 49 | "programmingLanguage": { "@id": "schema:programmingLanguage"}, 50 | "provider": { "@id": "schema:provider"}, 51 | "publisher": { "@id": "schema:publisher"}, 52 | "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, 53 | "releaseNotes": { "@id": "schema:releaseNotes", "@type": "@id"}, 54 | "runtimePlatform": { "@id": "schema:runtimePlatform"}, 55 | "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, 56 | "softwareHelp": { "@id": "schema:softwareHelp"}, 57 | "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, 58 | "softwareVersion": { "@id": "schema:softwareVersion"}, 59 | "sponsor": { "@id": "schema:sponsor"}, 60 | "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, 61 | "supportingData": { "@id": "schema:supportingData"}, 62 | "targetProduct": { "@id": "schema:targetProduct"}, 63 | "url": { "@id": "schema:url", "@type": "@id"}, 64 | "version": { "@id": "schema:version"}, 65 | 66 | "author": { "@id": "schema:author", "@container": "@list" }, 67 | 68 | "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, 69 
| "contIntegration": { "@id": "codemeta:contIntegration", "@type": "@id"}, 70 | "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, 71 | "developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, 72 | "embargoDate": { "@id":"codemeta:embargoDate", "@type": "schema:Date" }, 73 | "funding": { "@id": "codemeta:funding" }, 74 | "readme": { "@id":"codemeta:readme", "@type": "@id" }, 75 | "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, 76 | "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, 77 | "maintainer": { "@id": "codemeta:maintainer" } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/codemeta-3.0.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": { 3 | "type": "@type", 4 | "id": "@id", 5 | "schema":"http://schema.org/", 6 | "codemeta": "https://codemeta.github.io/terms/", 7 | "Organization": {"@id": "schema:Organization"}, 8 | "Person": {"@id": "schema:Person"}, 9 | "Review": {"@id": "schema:Review"}, 10 | "Role": {"@id": "schema:Role"}, 11 | "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, 12 | "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, 13 | "Text": {"@id": "schema:Text"}, 14 | "URL": {"@id": "schema:URL"}, 15 | "address": { "@id": "schema:address"}, 16 | "affiliation": { "@id": "schema:affiliation"}, 17 | "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, 18 | "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, 19 | "citation": { "@id": "schema:citation"}, 20 | "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, 21 | "contributor": { "@id": "schema:contributor"}, 22 | "copyrightHolder": { "@id": "schema:copyrightHolder"}, 23 | "copyrightYear": { "@id": "schema:copyrightYear"}, 24 | "dateCreated": {"@id": "schema:dateCreated", 
"@type": "schema:Date" }, 25 | "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, 26 | "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, 27 | "description": { "@id": "schema:description"}, 28 | "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, 29 | "email": { "@id": "schema:email"}, 30 | "editor": { "@id": "schema:editor"}, 31 | "encoding": { "@id": "schema:encoding"}, 32 | "endDate": { "@id": "schema:endDate"}, 33 | "familyName": { "@id": "schema:familyName"}, 34 | "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, 35 | "fileSize": { "@id": "schema:fileSize"}, 36 | "funder": { "@id": "schema:funder"}, 37 | "givenName": { "@id": "schema:givenName"}, 38 | "hasPart": { "@id": "schema:hasPart" }, 39 | "identifier": { "@id": "schema:identifier", "@type": "@id"}, 40 | "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, 41 | "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, 42 | "isPartOf": { "@id": "schema:isPartOf"}, 43 | "keywords": { "@id": "schema:keywords"}, 44 | "license": { "@id": "schema:license", "@type": "@id"}, 45 | "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, 46 | "name": { "@id": "schema:name"}, 47 | "operatingSystem": { "@id": "schema:operatingSystem"}, 48 | "permissions": { "@id": "schema:permissions"}, 49 | "position": { "@id": "schema:position"}, 50 | "processorRequirements": { "@id": "schema:processorRequirements"}, 51 | "producer": { "@id": "schema:producer"}, 52 | "programmingLanguage": { "@id": "schema:programmingLanguage"}, 53 | "provider": { "@id": "schema:provider"}, 54 | "publisher": { "@id": "schema:publisher"}, 55 | "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, 56 | "review": { "@id": "schema:review", "@type": "@id" }, 57 | "reviewAspect": { "@id": "schema:reviewAspect" }, 58 | "reviewBody": { "@id": "schema:reviewBody" }, 59 | "releaseNotes": { "@id": "schema:releaseNotes"}, 60 | "roleName": { "@id": 
"schema:roleName"}, 61 | "runtimePlatform": { "@id": "schema:runtimePlatform"}, 62 | "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, 63 | "softwareHelp": { "@id": "schema:softwareHelp"}, 64 | "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, 65 | "softwareVersion": { "@id": "schema:softwareVersion"}, 66 | "sponsor": { "@id": "schema:sponsor"}, 67 | "startDate": { "@id": "schema:startDate"}, 68 | "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, 69 | "supportingData": { "@id": "schema:supportingData"}, 70 | "targetProduct": { "@id": "schema:targetProduct"}, 71 | "url": { "@id": "schema:url", "@type": "@id"}, 72 | "version": { "@id": "schema:version"}, 73 | "author": { "@id": "schema:author", "@container": "@list" }, 74 | 75 | "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, 76 | "continuousIntegration": { "@id": "codemeta:continuousIntegration", "@type": "@id"}, 77 | "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, 78 | "developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, 79 | "embargoEndDate": { "@id":"codemeta:embargoEndDate", "@type": "schema:Date" }, 80 | "funding": { "@id": "codemeta:funding" }, 81 | "readme": { "@id":"codemeta:readme", "@type": "@id" }, 82 | "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, 83 | "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, 84 | "maintainer": { "@id": "codemeta:maintainer" }, 85 | "hasSourceCode": { "@id": "codemeta:hasSourceCode", "@type": "@id"}, 86 | "isSourceCodeOf": { "@id": "codemeta:isSourceCodeOf", "@type": "@id"} 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /swh/indexer/data/composer.csv: -------------------------------------------------------------------------------- 1 | Property,Composer 2 | codeRepository,support.source 3 | programmingLanguage, 4 | 
runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements,require 18 | softwareVersion,version 19 | storageRequirements, 20 | supportingData, 21 | author,authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,keywords 34 | license,license 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier,name 46 | name,name 47 | sameAs, 48 | url,homepage 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email,author.email 53 | affiliation, 54 | identifier, 55 | name,author.name 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions,suggest 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker,support.issues 67 | referencePublication, 68 | readme, -------------------------------------------------------------------------------- /swh/indexer/data/nuget.csv: -------------------------------------------------------------------------------- 1 | Property,NuGet 2 | codeRepository,repository.url 3 | programmingLanguage, 4 | runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes,releaseNotes 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion, 19 | storageRequirements, 20 | supportingData, 21 | author,authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | 
copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,tags 34 | license,license/licenseUrl 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description/summary 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,projectUrl 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email, 53 | affiliation, 54 | identifier,id 55 | name, 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker, 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/pubspec.csv: -------------------------------------------------------------------------------- 1 | Property,Pubspec 2 | codeRepository,repository 3 | programmingLanguage, 4 | runtimePlatform,platforms 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion,version 19 | storageRequirements, 20 | supportingData, 21 | author,author/authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,keywords 34 | license,license 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,homepage 49 | relatedLink, 50 | givenName, 51 | familyName, 52 
| email,author.email/authors.email 53 | affiliation, 54 | identifier, 55 | name, 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker,issue_tracker 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/schema.org/CITATION: -------------------------------------------------------------------------------- 1 | swh:1:cnt:8e5a38fb91a1e8ef272f02a43e67dfaf56cb8e6d;origin=https://github.com/schemaorg/schemaorg;visit=swh:1:snp:0115fb30b95e3dd177271313dcaaa898888e7a4c;anchor=swh:1:rev:d9bc1d722034ea2ebd3e3ea97f8f21d3e692b3f0;path=/data/releases/28.0/schemaorgcontext.jsonld 2 | -------------------------------------------------------------------------------- /swh/indexer/fossology_license.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import logging 7 | import subprocess 8 | from typing import Any, Dict, List, Optional 9 | 10 | import sentry_sdk 11 | 12 | from swh.core.config import merge_configs 13 | from swh.indexer.storage.interface import IndexerStorageInterface 14 | from swh.indexer.storage.model import ContentLicenseRow 15 | from swh.model import hashutil 16 | from swh.objstorage.interface import CompositeObjId 17 | 18 | from .indexer import ContentIndexer, write_to_temp 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def compute_license(path) -> Dict: 24 | """Determine license from file at path. 
25 | 26 | Args: 27 | path: filepath to determine the license 28 | 29 | Returns: 30 | dict: A dict with the following keys: 31 | 32 | - licenses ([str]): associated detected licenses to path 33 | - path (bytes): content filepath 34 | 35 | """ 36 | try: 37 | properties = subprocess.check_output(["nomossa", path], universal_newlines=True) 38 | if properties: 39 | res = properties.rstrip().split(" contains license(s) ") 40 | licenses = res[1].split(",") 41 | else: 42 | licenses = [] 43 | 44 | return { 45 | "licenses": licenses, 46 | "path": path, 47 | } 48 | except subprocess.CalledProcessError: 49 | from os import path as __path 50 | 51 | logger.exception( 52 | "Problem during license detection for sha1 %s" % __path.basename(path) 53 | ) 54 | sentry_sdk.capture_exception() 55 | return { 56 | "licenses": [], 57 | "path": path, 58 | } 59 | 60 | 61 | DEFAULT_CONFIG: Dict[str, Any] = { 62 | "workdir": "/tmp/swh/indexer.fossology.license", 63 | "tools": { 64 | "name": "nomos", 65 | "version": "3.1.0rc2-31-ga2cbb8c", 66 | "configuration": { 67 | "command_line": "nomossa ", 68 | }, 69 | }, 70 | "write_batch_size": 1000, 71 | } 72 | 73 | 74 | class MixinFossologyLicenseIndexer: 75 | """Mixin fossology license indexer. 76 | 77 | See :class:`FossologyLicenseIndexer` 78 | 79 | """ 80 | 81 | tool: Any 82 | idx_storage: IndexerStorageInterface 83 | 84 | def __init__(self, *args, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | self.config = merge_configs(DEFAULT_CONFIG, self.config) 87 | self.working_directory = self.config["workdir"] 88 | 89 | def index( 90 | self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs 91 | ) -> List[ContentLicenseRow]: 92 | """Index sha1s' content and store result. 
93 | 94 | Args: 95 | id (bytes): content's identifier 96 | raw_content (bytes): associated raw content to content id 97 | 98 | Returns: 99 | dict: A dict, representing a content_license, with keys: 100 | 101 | - id (bytes): content's identifier (sha1) 102 | - license (bytes): license in bytes 103 | - path (bytes): path 104 | - indexer_configuration_id (int): tool used to compute the output 105 | 106 | """ 107 | assert data is not None 108 | with write_to_temp( 109 | filename=hashutil.hash_to_hex(id["sha1"]), # use the id as pathname 110 | data=data, 111 | working_directory=self.working_directory, 112 | ) as content_path: 113 | properties = compute_license(path=content_path) 114 | return [ 115 | ContentLicenseRow( 116 | id=id["sha1"], 117 | indexer_configuration_id=self.tool["id"], 118 | license=license, 119 | ) 120 | for license in properties["licenses"] 121 | ] 122 | 123 | def persist_index_computations( 124 | self, results: List[ContentLicenseRow] 125 | ) -> Dict[str, int]: 126 | """Persist the results in storage. 
127 | 128 | Args: 129 | results: list of content_license dict with the 130 | following keys: 131 | 132 | - id (bytes): content's identifier (sha1) 133 | - license (bytes): license in bytes 134 | - path (bytes): path 135 | 136 | """ 137 | return self.idx_storage.content_fossology_license_add(results) 138 | 139 | 140 | class FossologyLicenseIndexer( 141 | MixinFossologyLicenseIndexer, ContentIndexer[ContentLicenseRow] 142 | ): 143 | """Indexer in charge of: 144 | 145 | - filtering out content already indexed 146 | - reading content from objstorage per the content's id (sha1) 147 | - computing {license, encoding} from that content 148 | - store result in storage 149 | 150 | """ 151 | 152 | pass 153 | -------------------------------------------------------------------------------- /swh/indexer/metadata_detector.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Dict, List 7 | 8 | from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS 9 | from swh.indexer.metadata_dictionary.base import DirectoryLsEntry 10 | from swh.objstorage.interface import CompositeObjId 11 | 12 | 13 | def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[CompositeObjId]]: 14 | """ 15 | Detects files potentially containing metadata 16 | 17 | Args: 18 | file_entries (list): list of files 19 | 20 | Returns: 21 | dict: {mapping_filenames[name]:f['sha1']} (may be empty) 22 | """ 23 | results = {} 24 | for mapping_name, mapping in INTRINSIC_MAPPINGS.items(): 25 | matches = mapping.detect_metadata_files(files) 26 | if matches: 27 | results[mapping_name] = matches 28 | return results 29 | 
-------------------------------------------------------------------------------- /swh/indexer/metadata_dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import collections 7 | from typing import Dict, Type 8 | 9 | import click 10 | 11 | from . import ( 12 | cff, 13 | codemeta, 14 | composer, 15 | dart, 16 | gitea, 17 | github, 18 | maven, 19 | npm, 20 | nuget, 21 | python, 22 | ruby, 23 | ) 24 | from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping 25 | 26 | INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { 27 | "CffMapping": cff.CffMapping, 28 | "CodemetaMapping": codemeta.CodemetaMapping, 29 | "GemspecMapping": ruby.GemspecMapping, 30 | "MavenMapping": maven.MavenMapping, 31 | "NpmMapping": npm.NpmMapping, 32 | "PubMapping": dart.PubspecMapping, 33 | "PythonPkginfoMapping": python.PythonPkginfoMapping, 34 | "ComposerMapping": composer.ComposerMapping, 35 | "NuGetMapping": nuget.NuGetMapping, 36 | } 37 | 38 | EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { 39 | "GiteaMapping": gitea.GiteaMapping, 40 | "GitHubMapping": github.GitHubMapping, 41 | "JsonSwordCodemetaMapping": codemeta.JsonSwordCodemetaMapping, 42 | "SwordCodemetaMapping": codemeta.SwordCodemetaMapping, 43 | } 44 | 45 | 46 | MAPPINGS: Dict[str, Type[BaseMapping]] = {**INTRINSIC_MAPPINGS, **EXTRINSIC_MAPPINGS} 47 | 48 | 49 | def list_terms(): 50 | """Returns a dictionary with all supported CodeMeta terms as keys, 51 | and the mappings that support each of them as values.""" 52 | d = collections.defaultdict(set) 53 | for mapping in MAPPINGS.values(): 54 | for term in mapping.supported_terms(): 55 | d[term].add(mapping) 56 | return d 57 
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict, List, Optional
import urllib.parse

from rdflib import BNode, Graph, Literal, URIRef
import rdflib.term

from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import RDF, SCHEMA

from .base import SingleFileIntrinsicMapping, YamlMapping
from .utils import add_map

DOI = URIRef("https://doi.org/")
SPDX = URIRef("https://spdx.org/licenses/")


class CffMapping(YamlMapping, SingleFileIntrinsicMapping):
    """Dedicated class for Citation (CITATION.cff) mapping and translation"""

    name = "cff"
    filename = b"CITATION.cff"
    mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
    string_fields = ["title", "keywords", "license", "abstract", "version", "doi"]
    date_fields = ["date-released"]
    uri_fields = ["url", "repository-code"]

    def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
        """Add a node for ``author`` to ``graph`` and return it.

        Authors whose ``orcid`` field parses as an absolute URL are
        identified by that URL; everyone else gets a blank node.
        """
        node: rdflib.term.Node
        if (
            "orcid" in author
            and isinstance(author["orcid"], str)
            and urllib.parse.urlparse(author["orcid"]).netloc
        ):
            node = URIRef(author["orcid"].strip())
        else:
            node = BNode()
        graph.add((node, RDF.type, SCHEMA.Person))
        if "affiliation" in author and isinstance(author["affiliation"], str):
            affiliation = BNode()
            graph.add((node, SCHEMA.affiliation, affiliation))
            graph.add((affiliation, RDF.type, SCHEMA.Organization))
            graph.add((affiliation, SCHEMA.name, Literal(author["affiliation"])))
        if "family-names" in author and isinstance(author["family-names"], str):
            graph.add((node, SCHEMA.familyName, Literal(author["family-names"])))
        if "given-names" in author and isinstance(author["given-names"], str):
            graph.add((node, SCHEMA.givenName, Literal(author["given-names"])))
        return node

    def translate_authors(
        self, graph: Graph, root: URIRef, authors: List[dict]
    ) -> None:
        """Translate the CFF ``authors`` list into ``schema:author`` triples."""
        add_map(graph, root, SCHEMA.author, self._translate_author, authors)

    def normalize_doi(self, s: str) -> Optional[URIRef]:
        # Return type fixed to Optional: non-string values are ignored and
        # previously fell through to an implicit (untyped) None.
        if isinstance(s, str):
            return DOI + s
        return None

    def normalize_license(self, s: str) -> Optional[URIRef]:
        # The license field is expected to be an SPDX identifier string;
        # anything else is ignored (returns None, not URIRef as previously
        # annotated).
        if isinstance(s, str):
            return SPDX + s
        return None

    def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]:
        # https://github.com/citation-file-format/citation-file-format/blob/main/schema-guide.md#credit-redirection
        return super()._translate_dict(
            content_dict.get("preferred-citation", content_dict)
        )
class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping):
    """Dedicated class for Packagist(composer.json) mapping and translation"""

    name = "composer"
    mapping = COMPOSER_TABLE["Composer"]
    filename = b"composer.json"
    string_fields = [
        "name",
        "description",
        "version",
        "keywords",
        "license",
        "author",
        "authors",
    ]
    uri_fields = ["homepage"]

    def normalize_license(self, s):
        # Composer licenses are SPDX identifiers; non-strings are dropped.
        return SPDX + s if isinstance(s, str) else None

    def _translate_author(self, graph: Graph, author) -> Optional[BNode]:
        """Turn one entry of the ``authors`` array into a schema:Person node.

        Returns None (so the entry is skipped) when it is not a JSON object.
        """
        if not isinstance(author, dict):
            return None
        person = BNode()
        graph.add((person, RDF.type, SCHEMA.Person))

        for json_key, schema_property in (
            ("name", SCHEMA.name),
            ("email", SCHEMA.email),
        ):
            value = author.get(json_key)
            if isinstance(value, str):
                graph.add((person, schema_property, Literal(value)))

        return person

    def translate_authors(self, graph: Graph, root: URIRef, authors) -> None:
        """Translate the ``authors`` array into an ordered schema:author list."""
        add_map(graph, root, SCHEMA.author, self._translate_author, authors)
License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import os.path 7 | import re 8 | 9 | from rdflib import RDF, BNode, Graph, Literal, URIRef 10 | 11 | from swh.indexer.codemeta import _DATA_DIR, read_crosstable 12 | from swh.indexer.namespaces import SCHEMA 13 | 14 | from .base import SingleFileIntrinsicMapping, YamlMapping 15 | from .utils import add_map 16 | 17 | SPDX = URIRef("https://spdx.org/licenses/") 18 | 19 | PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv") 20 | 21 | with open(PUB_TABLE_PATH) as fd: 22 | (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd) 23 | 24 | 25 | def name_to_person(name): 26 | return { 27 | "@type": SCHEMA.Person, 28 | SCHEMA.name: name, 29 | } 30 | 31 | 32 | class PubspecMapping(YamlMapping, SingleFileIntrinsicMapping): 33 | name = "pubspec" 34 | filename = b"pubspec.yaml" 35 | mapping = PUB_TABLE["Pubspec"] 36 | string_fields = [ 37 | "repository", 38 | "keywords", 39 | "description", 40 | "name", 41 | "issue_tracker", 42 | "platforms", 43 | "license", 44 | # license will only be used with the SPDX Identifier 45 | ] 46 | uri_fields = ["homepage"] 47 | 48 | def normalize_license(self, s): 49 | if isinstance(s, str): 50 | return SPDX + s 51 | 52 | def _translate_author(self, graph, s): 53 | name_email_re = re.compile("(?P.*?)( <(?P.*)>)") 54 | if isinstance(s, str): 55 | author = BNode() 56 | graph.add((author, RDF.type, SCHEMA.Person)) 57 | match = name_email_re.search(s) 58 | if match: 59 | name = match.group("name") 60 | email = match.group("email") 61 | graph.add((author, SCHEMA.email, Literal(email))) 62 | else: 63 | name = s 64 | 65 | graph.add((author, SCHEMA.name, Literal(name))) 66 | 67 | return author 68 | 69 | def translate_author(self, graph: Graph, root, s) -> None: 70 | add_map(graph, root, SCHEMA.author, self._translate_author, [s]) 71 | 72 | def translate_authors(self, graph: Graph, root, authors) -> None: 73 | if isinstance(authors, 
class NuGetMapping(XmlMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for NuGet (.nuspec) mapping and translation
    """

    name = "nuget"
    filename = re.compile(rb".*\.nuspec")
    # Build a private copy of the crosswalk row instead of mutating the
    # shared NUGET_TABLE["NuGet"] dict at class-definition time, so other
    # users of the table do not observe the two extra keys.
    mapping = {
        **NUGET_TABLE["NuGet"],
        "copyright": URIRef("http://schema.org/copyrightNotice"),
        "language": URIRef("http://schema.org/inLanguage"),
    }
    string_fields = [
        "description",
        "version",
        "name",
        "tags",
        "license",
        "summary",
        "copyright",
        "language",
    ]
    uri_fields = ["projectUrl", "licenseUrl"]

    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
        # The interesting metadata lives under <package><metadata>.
        return super()._translate_dict(d.get("package", {}).get("metadata", {}))

    def translate_repository(self, graph, root, v):
        """Translate a ``<repository url="..."/>`` element."""
        if isinstance(v, dict):
            # .get() instead of [] so a <repository> element without a
            # url attribute does not raise KeyError.
            url = v.get("@url")
            if isinstance(url, str):
                codemeta_key = URIRef(self.mapping["repository.url"])
                add_url_if_valid(graph, root, codemeta_key, url)

    def normalize_license(self, v):
        """Normalize a ``<license type="expression">`` SPDX expression.

        Only plain identifiers and "A or B" disjunctions are translated;
        expressions with parentheses, "and" or "with" are left out.
        """
        # .get() guards: a malformed element without @type/#text previously
        # raised KeyError.
        if isinstance(v, dict) and v.get("@type") == "expression":
            license_string = v.get("#text")
            if not isinstance(license_string, str):
                return None
            if not bool(
                re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE)
            ):
                return [
                    SPDX + license_type.strip()
                    for license_type in re.split(
                        r" or ", license_string, flags=re.IGNORECASE
                    )
                ]
        return None

    def translate_authors(self, graph: Graph, root, s):
        """Translate the comma-separated ``<authors>`` string."""
        if isinstance(s, str):
            authors = []
            for author_name in s.split(","):
                author_name = author_name.strip()
                author = BNode()
                graph.add((author, RDF.type, SCHEMA.Person))
                graph.add((author, SCHEMA.name, Literal(author_name)))
                authors.append(author)
            add_list(graph, root, SCHEMA.author, authors)

    def translate_releaseNotes(self, graph: Graph, root, s):
        if isinstance(s, str):
            graph.add((root, SCHEMA.releaseNotes, Literal(s)))

    def normalize_tags(self, s):
        # Tags are space-separated in .nuspec files.
        if isinstance(s, str):
            return [Literal(tag) for tag in s.split(" ")]
class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy):
    """Header policy that keeps the line breaks of folded header values.

    Folded continuation lines ("\\n ") are collapsed to plain newlines
    instead of being refolded, so multi-line fields keep their original
    line structure.
    """

    def header_fetch_parse(self, name, value):
        # Values carrying a .name attribute are already parsed header
        # objects; return them untouched.
        if hasattr(value, "name"):
            return value
        unfolded = value.replace("\n ", "\n")
        return self.header_factory(name, unfolded)
class GemspecMapping(DictMapping, SingleFileIntrinsicMapping):
    """Intrinsic metadata mapping for Ruby ``*.gemspec`` files."""

    name = "gemspec"
    filename = re.compile(rb".*\.gemspec")
    mapping = CROSSWALK_TABLE["Ruby Gem"]
    string_fields = ["name", "version", "description", "summary", "email"]
    uri_fields = ["homepage"]

    _re_spec_new = re.compile(r".*Gem::Specification.new +(do|\{) +\|.*\|.*")
    # NOTE(review): named groups restored -- "(?P\w+)" is not valid
    # named-group syntax and would raise re.error, while translate() below
    # reads group("key") and group("expr").
    _re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)")

    def translate(self, raw_content):
        """Extract the simple ``spec.key = value`` assignments of a gemspec.

        Args:
            raw_content: the gemspec file content, as bytes

        Returns:
            the translated metadata dict, or None when the content cannot
            be decoded or contains no ``Gem::Specification`` block.
        """
        try:
            raw_content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning("Error unidecoding from %s", self.log_suffix)
            return

        # Skip lines before 'Gem::Specification.new'
        lines = itertools.dropwhile(
            lambda x: not self._re_spec_new.match(x), raw_content.split("\n")
        )

        try:
            next(lines)  # Consume 'Gem::Specification.new'
        except StopIteration:
            self.log.warning("Could not find Gem::Specification in %s", self.log_suffix)
            return

        content_dict = {}
        for line in lines:
            match = self._re_spec_entry.match(line)
            if match:
                value = self.eval_ruby_expression(match.group("expr"))
                if value:
                    content_dict[match.group("key")] = value
        return self._translate_dict(content_dict)

    def eval_ruby_expression(self, expr):
        """Very simple evaluator of Ruby expressions.

        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
        ['Foo', 'bar']
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression( \
            "['Foo'.freeze, 'bar'.freeze]")
        ['Foo', 'bar']
        """

        def evaluator(node):
            # Python >= 3.8 parses string literals as ast.Constant, and
            # ast.Str was removed in Python 3.12; accept both node types so
            # the evaluator works on every supported interpreter.
            if isinstance(node, ast.Constant):
                if isinstance(node.value, str):
                    return node.value
            elif isinstance(node, getattr(ast, "Str", ())):
                return node.s
            elif isinstance(node, ast.List):
                res = []
                for element in node.elts:
                    val = evaluator(element)
                    if not val:
                        return
                    res.append(val)
                return res

        expr = expr.replace(".freeze", "")
        try:
            # We're parsing Ruby expressions here, but Python's
            # ast.parse works for very simple Ruby expressions
            # (mainly strings delimited with " or ', and lists
            # of such strings).
            tree = ast.parse(expr, mode="eval")
        except (SyntaxError, ValueError):
            return
        if isinstance(tree, ast.Expression):
            return evaluator(tree.body)

    def normalize_license(self, s):
        # A single license identifier maps to its SPDX URI.
        if isinstance(s, str):
            return SPDX + s

    def normalize_licenses(self, licenses):
        # A list of licenses: only the string entries are kept.
        if isinstance(licenses, list):
            return [SPDX + license for license in licenses if isinstance(license, str)]

    def translate_author(self, graph: Graph, root, author):
        if isinstance(author, str):
            add_map(graph, root, SCHEMA.author, name_to_person, [author])

    def translate_authors(self, graph: Graph, root, authors):
        if isinstance(authors, list):
            add_map(graph, root, SCHEMA.author, name_to_person, authors)
def add_map(
    graph: Graph,
    subject: rdflib.term.Node,
    predicate: rdflib.term.Identifier,
    f: Callable[[Graph, TValue], Optional[rdflib.term.Node]],
    values: Iterable[TValue],
) -> None:
    """Helper for :func:`add_list` that takes a mapper function ``f``.

    ``f`` is applied to every value; values mapped to a falsy result
    (e.g. ``None`` for unparseable entries) are dropped before the
    remaining nodes are linked into an RDF list.
    """
    mapped = (f(graph, value) for value in values)
    add_list(graph, subject, predicate, [node for node in mapped if node])
graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" 93 | ... ) 94 | >>> add_url_if_valid( 95 | ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" 96 | ... ) 97 | >>> add_url_if_valid( 98 | ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" 99 | ... ) 100 | >>> add_url_if_valid( 101 | ... graph, subject, predicate, 42 102 | ... ) 103 | >>> pprint(set(graph.triples((subject, predicate, None)))) 104 | {(rdflib.term.URIRef('http://example.org/test-software'), 105 | rdflib.term.URIRef('http://schema.org/license'), 106 | rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} 107 | """ 108 | if not isinstance(url, str): 109 | return 110 | try: 111 | parsed_url = urllib.parse.urlparse(url) 112 | except Exception: 113 | return 114 | if " " in url or not parsed_url.netloc: 115 | return 116 | graph.add((subject, predicate, rdflib.term.URIRef(url))) 117 | -------------------------------------------------------------------------------- /swh/indexer/mimetype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict, List, Optional 7 | 8 | import magic 9 | 10 | from swh.core.config import merge_configs 11 | from swh.indexer.storage.interface import IndexerStorageInterface 12 | from swh.indexer.storage.model import ContentMimetypeRow 13 | from swh.objstorage.interface import CompositeObjId 14 | 15 | from .indexer import ContentIndexer 16 | 17 | if not hasattr(magic.Magic, "from_buffer"): 18 | raise ImportError( 19 | 'Expected "import magic" to import python-magic, but file_magic ' 20 | "was imported instead." 
def compute_mimetype_encoding(raw_content: bytes) -> Dict[str, str]:
    """Determine mimetype and encoding from the raw content.

    Args:
        raw_content: content's raw data

    Returns:
        dict: mimetype and encoding key and corresponding values.

    """
    detector = magic.Magic(mime=True, mime_encoding=True)
    detected = detector.from_buffer(raw_content)
    # libmagic reports e.g. "text/plain; charset=us-ascii"; when the
    # separator is absent the whole string is the mimetype and the
    # encoding is left empty.
    try:
        mimetype, encoding = detected.split("; charset=")
    except ValueError:
        mimetype, encoding = detected, ""
    return {
        "mimetype": mimetype,
        "encoding": encoding,
    }
74 | 75 | Args: 76 | id: content's identifier 77 | data: raw content in bytes 78 | 79 | Returns: 80 | dict: content's mimetype; dict keys being 81 | 82 | - id: content's identifier (sha1) 83 | - mimetype: mimetype in bytes 84 | - encoding: encoding in bytes 85 | 86 | """ 87 | assert data is not None 88 | properties = compute_mimetype_encoding(data) 89 | return [ 90 | ContentMimetypeRow( 91 | id=id["sha1"], 92 | indexer_configuration_id=self.tool["id"], 93 | mimetype=properties["mimetype"], 94 | encoding=properties["encoding"], 95 | ) 96 | ] 97 | 98 | def persist_index_computations( 99 | self, results: List[ContentMimetypeRow] 100 | ) -> Dict[str, int]: 101 | """Persist the results in storage. 102 | 103 | Args: 104 | results: list of content's mimetype dicts 105 | (see :meth:`.index`) 106 | 107 | """ 108 | return self.idx_storage.content_mimetype_add(results) 109 | 110 | 111 | class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]): 112 | """Mimetype Indexer working on list of content identifiers. 113 | 114 | It: 115 | 116 | - (optionally) filters out content already indexed (cf. 
117 | :meth:`.filter`) 118 | - reads content from objstorage per the content's id (sha1) 119 | - computes {mimetype, encoding} from that content 120 | - stores result in storage 121 | 122 | """ 123 | 124 | def filter(self, ids: List[CompositeObjId]): 125 | """Filter out known sha1s and return only missing ones.""" 126 | yield from self.idx_storage.content_mimetype_missing( 127 | ( 128 | { 129 | "id": id["sha1"], 130 | "indexer_configuration_id": self.tool["id"], 131 | } 132 | for id in ids 133 | ) 134 | ) 135 | -------------------------------------------------------------------------------- /swh/indexer/namespaces.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from rdflib import Namespace as _Namespace 7 | from rdflib import RDF # noqa 8 | 9 | SCHEMA = _Namespace("http://schema.org/") 10 | CODEMETA = _Namespace("https://codemeta.github.io/terms/") 11 | FORGEFED = _Namespace("https://forgefed.org/ns#") 12 | ACTIVITYSTREAMS = _Namespace("https://www.w3.org/ns/activitystreams#") 13 | SPDX_LICENSES = _Namespace("https://spdx.org/licenses/") 14 | XSD = _Namespace("http://www.w3.org/2001/XMLSchema#") 15 | -------------------------------------------------------------------------------- /swh/indexer/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 
class RemoteStorage(RPCClient):
    """Proxy to a remote storage API"""

    # Interface whose endpoints this client exposes (see
    # swh.core.api.RPCClient for how calls are generated from it).
    backend_class = IndexerStorageInterface
    # Exception type raised for transport/API-level failures.
    api_exception = IndexerStorageAPIError
    # Server-side exceptions re-raised as-is on the client instead of
    # being wrapped in api_exception.
    reraise_exceptions = [IndexerStorageArgumentException, DuplicateId]
    # Extra (de)serializers for swh.indexer.storage.model row objects.
    extra_type_decoders = DECODERS
    extra_type_encoders = ENCODERS
9 | 10 | import swh.indexer.storage.model as idx_model 11 | 12 | 13 | def _encode_model_object(obj): 14 | d = obj.to_dict() 15 | d["__type__"] = type(obj).__name__ 16 | return d 17 | 18 | 19 | ENCODERS: List[Tuple[type, str, Callable]] = [ 20 | (idx_model.BaseRow, "idx_model", _encode_model_object), 21 | ] 22 | 23 | 24 | DECODERS: Dict[str, Callable] = { 25 | "idx_model": lambda d: getattr(idx_model, d.pop("__type__")).from_dict(d), 26 | } 27 | -------------------------------------------------------------------------------- /swh/indexer/storage/api/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import logging 7 | import os 8 | from typing import Any, Dict, Optional 9 | import warnings 10 | 11 | from swh.core import config 12 | from swh.core.api import RPCServerApp 13 | from swh.core.api import encode_data_server as encode_data 14 | from swh.core.api import error_handler 15 | from swh.indexer.storage import INDEXER_CFG_KEY, get_indexer_storage 16 | from swh.indexer.storage.exc import IndexerStorageArgumentException 17 | from swh.indexer.storage.interface import IndexerStorageInterface 18 | 19 | from .serializers import DECODERS, ENCODERS 20 | 21 | 22 | def get_storage(): 23 | global storage 24 | if not storage: 25 | storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) 26 | 27 | return storage 28 | 29 | 30 | class IndexerStorageServerApp(RPCServerApp): 31 | extra_type_decoders = DECODERS 32 | extra_type_encoders = ENCODERS 33 | 34 | 35 | app = IndexerStorageServerApp( 36 | __name__, backend_class=IndexerStorageInterface, backend_factory=get_storage 37 | ) 38 | storage = None 39 | 40 | 41 | @app.errorhandler(Exception) 42 | def 
def load_and_check_config(
    config_path: Optional[str],
) -> Dict[str, Any]:
    """Check the minimal configuration is set to run the api or raise an
    error explanation.

    Args:
        config_path: Path to the configuration file to load

    Raises:
        EnvironmentError: if no configuration path is given
        FileNotFoundError: if the configuration file does not exist
        KeyError: if the 'indexer_storage' section is missing

    Returns:
        configuration as a dict

    """
    if not config_path:
        raise EnvironmentError("Configuration file must be defined")

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file {config_path} does not exist")

    cfg = config.read(config_path)
    if "indexer.storage" in cfg:
        warnings.warn(
            "The 'indexer.storage' configuration section should be renamed "
            "as 'indexer_storage'",
            DeprecationWarning,
        )
        cfg["indexer_storage"] = cfg.pop("indexer.storage")
    if "indexer_storage" not in cfg:
        # Message fixed: it previously read "Missing '%indexer_storage'",
        # a leftover of %-style formatting.
        raise KeyError("Missing 'indexer_storage' configuration")

    return cfg
102 | 103 | """ 104 | global api_cfg 105 | if not api_cfg: 106 | config_path = os.environ.get("SWH_CONFIG_FILENAME") 107 | api_cfg = load_and_check_config(config_path) 108 | app.config.update(api_cfg) 109 | handler = logging.StreamHandler() 110 | app.logger.addHandler(handler) 111 | return app 112 | 113 | 114 | if __name__ == "__main__": 115 | print("Deprecated. Use swh-indexer") 116 | -------------------------------------------------------------------------------- /swh/indexer/storage/converters.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | def db_to_mimetype(mimetype): 8 | """Convert a mimetype entry into a ready mimetype output.""" 9 | return { 10 | "id": mimetype["id"], 11 | "encoding": mimetype["encoding"], 12 | "mimetype": mimetype["mimetype"], 13 | "tool": { 14 | "id": mimetype["tool_id"], 15 | "name": mimetype["tool_name"], 16 | "version": mimetype["tool_version"], 17 | "configuration": mimetype["tool_configuration"], 18 | }, 19 | } 20 | 21 | 22 | def db_to_metadata(metadata): 23 | """Convert a metadata entry into a ready metadata output.""" 24 | metadata["tool"] = { 25 | "id": metadata["tool_id"], 26 | "name": metadata["tool_name"], 27 | "version": metadata["tool_version"], 28 | "configuration": metadata["tool_configuration"], 29 | } 30 | del metadata["tool_id"], metadata["tool_configuration"] 31 | del metadata["tool_version"], metadata["tool_name"] 32 | return metadata 33 | 34 | 35 | def db_to_fossology_license(license): 36 | return { 37 | "id": license["id"], 38 | "license": license["license"], 39 | "tool": { 40 | "id": license["tool_id"], 41 | "name": license["tool_name"], 42 | "version": license["tool_version"], 43 | "configuration": 
license["tool_configuration"], 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /swh/indexer/storage/exc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | class IndexerStorageAPIError(Exception): 8 | """Generic error of the indexer storage.""" 9 | 10 | pass 11 | 12 | 13 | class IndexerStorageArgumentException(Exception): 14 | """Argument passed to an IndexerStorage endpoint is invalid.""" 15 | 16 | pass 17 | 18 | 19 | class DuplicateId(IndexerStorageArgumentException): 20 | """The same identifier is present more than once.""" 21 | 22 | pass 23 | -------------------------------------------------------------------------------- /swh/indexer/storage/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from functools import wraps 7 | import logging 8 | 9 | from swh.core.statsd import statsd 10 | 11 | OPERATIONS_METRIC = "swh_indexer_storage_operations_total" 12 | OPERATIONS_UNIT_METRIC = "swh_indexer_storage_operations_{unit}_total" 13 | DURATION_METRIC = "swh_indexer_storage_request_duration_seconds" 14 | 15 | 16 | def timed(f): 17 | """Time that function!""" 18 | 19 | @wraps(f) 20 | def d(*a, **kw): 21 | with statsd.timed(DURATION_METRIC, tags={"endpoint": f.__name__}): 22 | return f(*a, **kw) 23 | 24 | return d 25 | 26 | 27 | def send_metric(metric, count, method_name): 28 | """Send statsd metric with 
count for method `method_name` 29 | 30 | If count is 0, the metric is discarded. If the metric is not 31 | parseable, the metric is discarded with a log message. 32 | 33 | Args: 34 | metric (str): Metric's name (e.g content:add, content:add:bytes) 35 | count (int): Associated value for the metric 36 | method_name (str): Method's name 37 | 38 | Returns: 39 | Bool to explicit if metric has been set or not 40 | """ 41 | if count == 0: 42 | return False 43 | 44 | metric_type = metric.split(":") 45 | _length = len(metric_type) 46 | if _length == 2: 47 | object_type, operation = metric_type 48 | metric_name = OPERATIONS_METRIC 49 | elif _length == 3: 50 | object_type, operation, unit = metric_type 51 | metric_name = OPERATIONS_UNIT_METRIC.format(unit=unit) 52 | else: 53 | logging.warning("Skipping unknown metric {%s: %s}" % (metric, count)) 54 | return False 55 | 56 | statsd.increment( 57 | metric_name, 58 | count, 59 | tags={ 60 | "endpoint": method_name, 61 | "object_type": object_type, 62 | "operation": operation, 63 | }, 64 | ) 65 | return True 66 | 67 | 68 | def process_metrics(f): 69 | """Increment object counters for the decorated function.""" 70 | 71 | @wraps(f) 72 | def d(*a, **kw): 73 | r = f(*a, **kw) 74 | for metric, count in r.items(): 75 | send_metric(metric=metric, count=count, method_name=f.__name__) 76 | 77 | return r 78 | 79 | return d 80 | -------------------------------------------------------------------------------- /swh/indexer/storage/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | """Classes used internally by the in-memory idx-storage, and will be 7 | used for the interface of the idx-storage in the near future.""" 8 | 9 | from 
__future__ import annotations 10 | 11 | import json 12 | from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar 13 | 14 | import attr 15 | from typing_extensions import Final 16 | 17 | from swh.model.model import Sha1Git 18 | 19 | TSelf = TypeVar("TSelf") 20 | 21 | 22 | @attr.s 23 | class BaseRow: 24 | UNIQUE_KEY_FIELDS: Tuple = ("id",) 25 | 26 | id = attr.ib(type=Any) 27 | indexer_configuration_id = attr.ib(type=Optional[int], default=None, kw_only=True) 28 | tool = attr.ib(type=Optional[Dict], default=None, kw_only=True) 29 | 30 | def __attrs_post_init__(self): 31 | if self.indexer_configuration_id is None and self.tool is None: 32 | raise TypeError("Either indexer_configuration_id or tool must be not None.") 33 | if self.indexer_configuration_id is not None and self.tool is not None: 34 | raise TypeError( 35 | "indexer_configuration_id and tool are mutually exclusive; " 36 | "only one may be not None." 37 | ) 38 | 39 | def anonymize(self: TSelf) -> Optional[TSelf]: 40 | # Needed to implement swh.journal.writer.ValueProtocol 41 | return None 42 | 43 | def to_dict(self) -> Dict[str, Any]: 44 | """Wrapper of `attr.asdict` that can be overridden by subclasses 45 | that have special handling of some of the fields.""" 46 | d = attr.asdict(self, recurse=False) 47 | 48 | if d["indexer_configuration_id"] is None: 49 | del d["indexer_configuration_id"] 50 | if d["tool"] is None: 51 | del d["tool"] 52 | 53 | return d 54 | 55 | @classmethod 56 | def from_dict(cls: Type[TSelf], d) -> TSelf: 57 | return cls(**d) 58 | 59 | def unique_key(self) -> Dict: 60 | if not self.tool: 61 | raise ValueError( 62 | f"Cannot compute unique_key of {self.__class__.__name__} with no tool " 63 | f"dictionary (indexer_configuration_id was given instead)" 64 | ) 65 | 66 | tool_dict = { 67 | "tool_name": self.tool["name"], 68 | "tool_version": self.tool["version"], 69 | "tool_configuration": json.dumps( 70 | self.tool["configuration"], sort_keys=True 71 | ), 72 | } 73 | 74 | return { 
75 | **{key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS}, 76 | **tool_dict, 77 | } 78 | 79 | 80 | @attr.s 81 | class ContentMimetypeRow(BaseRow): 82 | object_type: Final = "content_mimetype" 83 | 84 | id = attr.ib(type=Sha1Git) 85 | mimetype = attr.ib(type=str) 86 | encoding = attr.ib(type=str) 87 | 88 | 89 | @attr.s 90 | class ContentLicenseRow(BaseRow): 91 | object_type: Final = "content_fossology_license" 92 | UNIQUE_KEY_FIELDS = ("id", "license") 93 | 94 | id = attr.ib(type=Sha1Git) 95 | license = attr.ib(type=str) 96 | 97 | 98 | @attr.s 99 | class ContentMetadataRow(BaseRow): 100 | object_type: Final = "content_metadata" 101 | 102 | id = attr.ib(type=Sha1Git) 103 | metadata = attr.ib(type=Dict[str, Any]) 104 | 105 | 106 | @attr.s 107 | class DirectoryIntrinsicMetadataRow(BaseRow): 108 | object_type: Final = "directory_intrinsic_metadata" 109 | 110 | id = attr.ib(type=Sha1Git) 111 | metadata = attr.ib(type=Dict[str, Any]) 112 | mappings = attr.ib(type=List[str]) 113 | 114 | 115 | @attr.s 116 | class OriginIntrinsicMetadataRow(BaseRow): 117 | object_type: Final = "origin_intrinsic_metadata" 118 | 119 | id = attr.ib(type=str) 120 | metadata = attr.ib(type=Dict[str, Any]) 121 | from_directory = attr.ib(type=Sha1Git) 122 | mappings = attr.ib(type=List[str]) 123 | 124 | 125 | @attr.s 126 | class OriginExtrinsicMetadataRow(BaseRow): 127 | object_type: Final = "origin_extrinsic_metadata" 128 | 129 | id = attr.ib(type=str) 130 | """origin URL""" 131 | metadata = attr.ib(type=Dict[str, Any]) 132 | from_remd_id = attr.ib(type=Sha1Git) 133 | """id of the RawExtrinsicMetadata object used as source for indexed metadata""" 134 | mappings = attr.ib(type=List[str]) 135 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/10-superuser-init.sql: -------------------------------------------------------------------------------- 1 | create extension if not exists btree_gist; 2 | create extension if not exists pgcrypto; 
3 | 4 | create or replace language plpgsql; 5 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/20-enums.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/swh/indexer/storage/sql/20-enums.sql -------------------------------------------------------------------------------- /swh/indexer/storage/sql/50-data.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/swh/indexer/storage/sql/50-data.sql -------------------------------------------------------------------------------- /swh/indexer/storage/sql/60-indexes.sql: -------------------------------------------------------------------------------- 1 | -- fossology_license 2 | create unique index fossology_license_pkey on fossology_license(id); 3 | alter table fossology_license add primary key using index fossology_license_pkey; 4 | 5 | create unique index on fossology_license(name); 6 | 7 | -- indexer_configuration 8 | create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); 9 | alter table indexer_configuration add primary key using index indexer_configuration_pkey; 10 | 11 | create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); 12 | 13 | -- content_metadata 14 | create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); 15 | alter table content_metadata add primary key using index content_metadata_pkey; 16 | 17 | alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 18 | alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; 
19 | 20 | -- directory_intrinsic_metadata 21 | create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); 22 | alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; 23 | 24 | alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 25 | alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; 26 | 27 | -- content_mimetype 28 | create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); 29 | alter table content_mimetype add primary key using index content_mimetype_pkey; 30 | 31 | alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 32 | alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; 33 | 34 | create index on content_mimetype(id) where mimetype like 'text/%'; 35 | 36 | -- content_fossology_license 37 | create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); 38 | alter table content_fossology_license add primary key using index content_fossology_license_pkey; 39 | 40 | alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; 41 | alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; 42 | 43 | alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 44 | alter table content_fossology_license validate 
constraint content_fossology_license_indexer_configuration_id_fkey; 45 | 46 | -- origin_intrinsic_metadata 47 | create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); 48 | alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; 49 | 50 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 51 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 52 | 53 | create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); 54 | create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); 55 | 56 | -- origin_extrinsic_metadata 57 | create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id); 58 | alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey; 59 | 60 | alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 61 | alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey; 62 | 63 | create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector); 64 | create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings); 65 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/115.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 114 3 | -- to_version: 115 4 | -- 
description: Remove temporary table use in reading api 5 | 6 | insert into dbversion(version, release, description) 7 | values(115, now(), 'Work In Progress'); 8 | 9 | drop function swh_mktemp_content_mimetype_missing(); 10 | drop function swh_content_mimetype_missing(); 11 | 12 | drop function swh_content_mimetype_get(); 13 | drop type content_mimetype_signature; 14 | 15 | drop function swh_mktemp_content_language_missing(); 16 | drop function swh_content_language_missing(); 17 | 18 | drop function swh_content_language_get(); 19 | drop type content_language_signature; 20 | 21 | drop function swh_mktemp_content_ctags_missing(); 22 | drop function swh_content_ctags_missing(); 23 | 24 | drop function swh_content_ctags_get(); 25 | --drop type content_ctags_signature; -- still used in swh_content_ctags_search 26 | 27 | drop function swh_content_fossology_license_get(); 28 | drop type content_fossology_license_signature; 29 | 30 | drop function swh_mktemp_content_metadata_missing(); 31 | drop function swh_content_metadata_missing(); 32 | 33 | drop function swh_content_metadata_get(); 34 | drop type content_metadata_signature; 35 | 36 | drop function swh_mktemp_revision_metadata_missing(); 37 | drop function swh_revision_metadata_missing(); 38 | 39 | drop function swh_revision_metadata_get(); 40 | drop type revision_metadata_signature; 41 | 42 | drop function swh_mktemp_bytea(); 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/116.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 115 3 | -- to_version: 116 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(116, now(), 'Work In Progress'); 8 | 9 | drop table origin_metadata_translation; 10 | 11 | create table origin_intrinsic_metadata( 12 | origin_id bigserial not null, 13 | metadata jsonb, 14 | indexer_configuration_id 
bigint not null, 15 | from_revision sha1_git not null 16 | ); 17 | 18 | comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; 19 | comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; 20 | comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; 21 | comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; 22 | comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; 23 | 24 | -- create a temporary table for retrieving origin_intrinsic_metadata 25 | create or replace function swh_mktemp_origin_intrinsic_metadata() 26 | returns void 27 | language sql 28 | as $$ 29 | create temporary table tmp_origin_intrinsic_metadata ( 30 | like origin_intrinsic_metadata including defaults 31 | ) on commit drop; 32 | $$; 33 | 34 | comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; 35 | 36 | 37 | -- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, 38 | -- overwriting duplicates if conflict_update is true, skipping duplicates 39 | -- otherwise. 40 | -- 41 | -- If filtering duplicates is in order, the call to 42 | -- swh_origin_intrinsic_metadata_missing must take place before calling this 43 | -- function. 44 | -- 45 | -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to 46 | -- tmp_origin_intrinsic_metadata, 2. 
call this function 47 | create or replace function swh_origin_intrinsic_metadata_add( 48 | conflict_update boolean) 49 | returns void 50 | language plpgsql 51 | as $$ 52 | begin 53 | if conflict_update then 54 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) 55 | select origin_id, metadata, indexer_configuration_id, from_revision 56 | from tmp_origin_intrinsic_metadata 57 | on conflict(origin_id, indexer_configuration_id) 58 | do update set metadata = excluded.metadata; 59 | 60 | else 61 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) 62 | select origin_id, metadata, indexer_configuration_id, from_revision 63 | from tmp_origin_intrinsic_metadata 64 | on conflict(origin_id, indexer_configuration_id) 65 | do nothing; 66 | end if; 67 | return; 68 | end 69 | $$; 70 | 71 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 72 | 73 | 74 | -- origin_intrinsic_metadata 75 | create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); 76 | alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; 77 | 78 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 79 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 80 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; 81 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; 82 | 
-------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/117.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 116 3 | -- to_version: 117 4 | -- description: Add fulltext search index for origin intrinsic metadata 5 | 6 | insert into dbversion(version, release, description) 7 | values(117, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata add column metadata_tsvector tsvector; 10 | update origin_intrinsic_metadata set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 11 | create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); 12 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/118.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 117 3 | -- to_version: 118 4 | -- description: content_mimetype: Migrate bytes column to text 5 | 6 | insert into dbversion(version, release, description) 7 | values(118, now(), 'Work In Progress'); 8 | 9 | alter table content_mimetype 10 | alter column mimetype set data type text 11 | using convert_from(mimetype, 'utf-8'), 12 | alter column encoding set data type text 13 | using convert_from(encoding, 'utf-8'); 14 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/119.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 118 3 | -- to_version: 119 4 | -- description: metadata tables: add 'mappings' column 5 | 6 | insert into dbversion(version, release, description) 7 | values(119, now(), 'Work In Progress'); 8 | 9 | alter table revision_metadata 10 | add column mappings text array not 
null default '{}'; 11 | alter table revision_metadata 12 | alter column mappings 13 | drop default; 14 | 15 | alter table origin_intrinsic_metadata 16 | add column mappings text array not null default '{}'; 17 | alter table origin_intrinsic_metadata 18 | alter column mappings 19 | drop default; 20 | 21 | 22 | create or replace function swh_revision_metadata_add(conflict_update boolean) 23 | returns void 24 | language plpgsql 25 | as $$ 26 | begin 27 | if conflict_update then 28 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 29 | select id, translated_metadata, mappings, indexer_configuration_id 30 | from tmp_revision_metadata tcm 31 | on conflict(id, indexer_configuration_id) 32 | do update set translated_metadata = excluded.translated_metadata; 33 | 34 | else 35 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 36 | select id, translated_metadata, mappings, indexer_configuration_id 37 | from tmp_revision_metadata tcm 38 | on conflict(id, indexer_configuration_id) 39 | do nothing; 40 | end if; 41 | return; 42 | end 43 | $$; 44 | 45 | 46 | create or replace function swh_origin_intrinsic_metadata_add( 47 | conflict_update boolean) 48 | returns void 49 | language plpgsql 50 | as $$ 51 | begin 52 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 53 | if conflict_update then 54 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 55 | select origin_id, metadata, indexer_configuration_id, from_revision, 56 | metadata_tsvector, mappings 57 | from tmp_origin_intrinsic_metadata 58 | on conflict(origin_id, indexer_configuration_id) 59 | do update set metadata = excluded.metadata; 60 | 61 | else 62 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 63 | select origin_id, metadata, indexer_configuration_id, 
from_revision, 64 | metadata_tsvector, mappings 65 | from tmp_origin_intrinsic_metadata 66 | on conflict(origin_id, indexer_configuration_id) 67 | do nothing; 68 | end if; 69 | return; 70 | end 71 | $$; 72 | 73 | 74 | -- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata. 75 | -- 76 | -- It uses the "pg_catalog.simple" dictionary, as it has no stopword, 77 | -- so it should be suitable for proper names and non-English text. 78 | create or replace function swh_origin_intrinsic_metadata_compute_tsvector() 79 | returns void 80 | language plpgsql 81 | as $$ 82 | begin 83 | update tmp_origin_intrinsic_metadata 84 | set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 85 | end 86 | $$; 87 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/120.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 119 3 | -- to_version: 120 4 | -- description: fix updates of the 'mappings' column in metadata tables 5 | 6 | insert into dbversion(version, release, description) 7 | values(120, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_revision_metadata_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | if conflict_update then 15 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 16 | select id, translated_metadata, mappings, indexer_configuration_id 17 | from tmp_revision_metadata tcm 18 | on conflict(id, indexer_configuration_id) 19 | do update set 20 | translated_metadata = excluded.translated_metadata, 21 | mappings = excluded.mappings; 22 | 23 | else 24 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 25 | select id, translated_metadata, mappings, indexer_configuration_id 26 | from tmp_revision_metadata tcm 27 | on conflict(id, 
indexer_configuration_id) 28 | do nothing; 29 | end if; 30 | return; 31 | end 32 | $$; 33 | 34 | create or replace function swh_origin_intrinsic_metadata_add( 35 | conflict_update boolean) 36 | returns void 37 | language plpgsql 38 | as $$ 39 | begin 40 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 41 | if conflict_update then 42 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 43 | select origin_id, metadata, indexer_configuration_id, from_revision, 44 | metadata_tsvector, mappings 45 | from tmp_origin_intrinsic_metadata 46 | on conflict(origin_id, indexer_configuration_id) 47 | do update set 48 | metadata = excluded.metadata, 49 | mappings = excluded.mappings; 50 | 51 | else 52 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 53 | select origin_id, metadata, indexer_configuration_id, from_revision, 54 | metadata_tsvector, mappings 55 | from tmp_origin_intrinsic_metadata 56 | on conflict(origin_id, indexer_configuration_id) 57 | do nothing; 58 | end if; 59 | return; 60 | end 61 | $$; 62 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/121.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 120 3 | -- to_version: 121 4 | -- description: add comment to the 'mappings' column 5 | 6 | insert into dbversion(version, release, description) 7 | values(121, now(), 'Work In Progress'); 8 | 9 | comment on column revision_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; 10 | comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; 11 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/122.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 121 3 | -- to_version: 122 4 | -- description: add index to search origin_intrinsic_metadata for mappings. 5 | 6 | insert into dbversion(version, release, description) 7 | values(122, now(), 'Work In Progress'); 8 | 9 | create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/123.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 122 3 | -- to_version: 123 4 | -- description: fix heterogeneity of names in metadata tables 5 | 6 | insert into dbversion(version, release, description) 7 | values(123, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_metadata_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | if conflict_update then 15 | insert into content_metadata (id, metadata, indexer_configuration_id) 16 | select id, metadata, indexer_configuration_id 17 | from tmp_content_metadata tcm 18 | on conflict(id, indexer_configuration_id) 19 | do update set metadata = excluded.metadata; 20 | 21 | else 22 | insert into content_metadata (id, metadata, indexer_configuration_id) 23 | select id, metadata, indexer_configuration_id 24 | from tmp_content_metadata tcm 25 | on conflict(id, indexer_configuration_id) 26 | do nothing; 27 | end if; 28 | return; 29 | end 30 | $$; 31 | 32 | alter function swh_revision_metadata_add rename to swh_revision_intrinsic_metadata_add; 33 | create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) 34 | returns void 
35 | language plpgsql 36 | as $$ 37 | begin 38 | if conflict_update then 39 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 40 | select id, metadata, mappings, indexer_configuration_id 41 | from tmp_revision_intrinsic_metadata tcm 42 | on conflict(id, indexer_configuration_id) 43 | do update set 44 | metadata = excluded.metadata, 45 | mappings = excluded.mappings; 46 | 47 | else 48 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 49 | select id, metadata, mappings, indexer_configuration_id 50 | from tmp_revision_intrinsic_metadata tcm 51 | on conflict(id, indexer_configuration_id) 52 | do nothing; 53 | end if; 54 | return; 55 | end 56 | $$; 57 | 58 | alter function swh_mktemp_revision_metadata rename to swh_mktemp_revision_intrinsic_metadata; 59 | create or replace function swh_mktemp_revision_intrinsic_metadata() 60 | returns void 61 | language sql 62 | as $$ 63 | create temporary table tmp_revision_intrinsic_metadata ( 64 | like revision_intrinsic_metadata including defaults 65 | ) on commit drop; 66 | $$; 67 | 68 | create or replace function swh_origin_intrinsic_metadata_add( 69 | conflict_update boolean) 70 | returns void 71 | language plpgsql 72 | as $$ 73 | begin 74 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 75 | if conflict_update then 76 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 77 | select id, metadata, indexer_configuration_id, from_revision, 78 | metadata_tsvector, mappings 79 | from tmp_origin_intrinsic_metadata 80 | on conflict(id, indexer_configuration_id) 81 | do update set 82 | metadata = excluded.metadata, 83 | mappings = excluded.mappings; 84 | 85 | else 86 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 87 | select id, metadata, indexer_configuration_id, from_revision, 88 | 
metadata_tsvector, mappings 89 | from tmp_origin_intrinsic_metadata 90 | on conflict(id, indexer_configuration_id) 91 | do nothing; 92 | end if; 93 | return; 94 | end 95 | $$; 96 | 97 | alter index revision_metadata_pkey rename to revision_intrinsic_metadata_pkey; 98 | 99 | alter table revision_metadata rename column translated_metadata to metadata; 100 | alter table content_metadata rename column translated_metadata to metadata; 101 | alter table origin_intrinsic_metadata rename column origin_id to id; 102 | 103 | alter table revision_metadata rename to revision_intrinsic_metadata; 104 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/124.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 123 3 | -- to_version: 124 4 | -- description: drop constraint that origin_intrinsic_metadata references an existing revision_intrinsic_metadata. 5 | 6 | insert into dbversion(version, release, description) 7 | values(124, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata drop constraint origin_intrinsic_metadata_revision_metadata_fkey; 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/125.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 124 3 | -- to_version: 125 4 | -- description: Add 'origin_url' column to origin_intrinsic_metadata. 
5 | 6 | insert into dbversion(version, release, description) 7 | values(125, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata 10 | add column origin_url text; 11 | 12 | create or replace function swh_origin_intrinsic_metadata_add( 13 | conflict_update boolean) 14 | returns void 15 | language plpgsql 16 | as $$ 17 | begin 18 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 19 | if conflict_update then 20 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 21 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 22 | metadata_tsvector, mappings 23 | from tmp_origin_intrinsic_metadata 24 | on conflict(id, indexer_configuration_id) 25 | do update set 26 | metadata = excluded.metadata, 27 | mappings = excluded.mappings; 28 | 29 | else 30 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 31 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 32 | metadata_tsvector, mappings 33 | from tmp_origin_intrinsic_metadata 34 | on conflict(id, indexer_configuration_id) 35 | do nothing; 36 | end if; 37 | return; 38 | end 39 | $$; 40 | 41 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 42 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/126.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 125 3 | -- to_version: 126 4 | -- description: Make swh_origin_intrinsic_metadata_add update all fields 5 | 6 | insert into dbversion(version, release, description) 7 | values(126, now(), 'Work In Progress'); 8 | 9 | 10 | create or replace function swh_origin_intrinsic_metadata_add( 11 | conflict_update boolean) 12 | returns void 13 
| language plpgsql 14 | as $$ 15 | begin 16 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 17 | if conflict_update then 18 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 19 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 20 | metadata_tsvector, mappings 21 | from tmp_origin_intrinsic_metadata 22 | on conflict(id, indexer_configuration_id) 23 | do update set 24 | metadata = excluded.metadata, 25 | metadata_tsvector = excluded.metadata_tsvector, 26 | mappings = excluded.mappings, 27 | origin_url = excluded.origin_url, 28 | from_revision = excluded.from_revision; 29 | 30 | else 31 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 32 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 33 | metadata_tsvector, mappings 34 | from tmp_origin_intrinsic_metadata 35 | on conflict(id, indexer_configuration_id) 36 | do nothing; 37 | end if; 38 | return; 39 | end 40 | $$; 41 | 42 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/127.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 126 3 | -- to_version: 127 4 | -- description: Remove swh_origin_intrinsic_metadata_add origin_url field and 5 | -- replace id by the former content of origin_url 6 | 7 | insert into dbversion(version, release, description) 8 | values(127, now(), 'Work In Progress'); 9 | 10 | -- replace id column by origin_url 11 | alter table origin_intrinsic_metadata 12 | drop constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 13 | alter table origin_intrinsic_metadata 14 | drop constraint 
origin_intrinsic_metadata_pkey; 15 | alter table origin_intrinsic_metadata 16 | drop column id; 17 | alter table origin_intrinsic_metadata 18 | rename column origin_url to id; 19 | comment on column origin_intrinsic_metadata.id is 'url of the origin'; 20 | 21 | -- replace functions that operate on this table 22 | create or replace function swh_origin_intrinsic_metadata_add( 23 | conflict_update boolean) 24 | returns void 25 | language plpgsql 26 | as $$ 27 | begin 28 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 29 | if conflict_update then 30 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 31 | select id, metadata, indexer_configuration_id, from_revision, 32 | metadata_tsvector, mappings 33 | from tmp_origin_intrinsic_metadata 34 | on conflict(id, indexer_configuration_id) 35 | do update set 36 | metadata = excluded.metadata, 37 | metadata_tsvector = excluded.metadata_tsvector, 38 | mappings = excluded.mappings, 39 | from_revision = excluded.from_revision; 40 | 41 | else 42 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 43 | select id, metadata, indexer_configuration_id, from_revision, 44 | metadata_tsvector, mappings 45 | from tmp_origin_intrinsic_metadata 46 | on conflict(id, indexer_configuration_id) 47 | do nothing; 48 | end if; 49 | return; 50 | end 51 | $$; 52 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 53 | 54 | -- recreate indexes/constraints on this table 55 | create unique index origin_intrinsic_metadata_pkey 56 | on origin_intrinsic_metadata(id, indexer_configuration_id); 57 | alter table origin_intrinsic_metadata 58 | add primary key using index origin_intrinsic_metadata_pkey; 59 | 60 | alter table origin_intrinsic_metadata 61 | add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key 
(indexer_configuration_id) references indexer_configuration(id) not valid; 62 | alter table origin_intrinsic_metadata 63 | validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 64 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/128.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 127 3 | -- to_version: 128 4 | -- description: Add index on content_mimetype table to improve read queries 5 | 6 | insert into dbversion(version, release, description) 7 | values(128, now(), 'Work In Progress'); 8 | 9 | create index on content_mimetype(id) where mimetype like 'text/%'; 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/129.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 128 3 | -- to_version: 129 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(129, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_mktemp(tblname regclass) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | execute format(' 15 | create temporary table if not exists tmp_%1$I 16 | (like %1$I including defaults) 17 | on commit delete rows; 18 | alter table tmp_%1$I drop column if exists object_id; 19 | ', tblname); 20 | return; 21 | end 22 | $$; 23 | 24 | -- create a temporary table for content_mimetype tmp_content_mimetype, 25 | create or replace function swh_mktemp_content_mimetype() 26 | returns void 27 | language sql 28 | as $$ 29 | create temporary table if not exists tmp_content_mimetype ( 30 | like content_mimetype including defaults 31 | ) on commit delete rows; 32 | $$; 33 | 34 | -- create a temporary table for retrieving content_language 35 | create or replace function 
swh_mktemp_content_language() 36 | returns void 37 | language sql 38 | as $$ 39 | create temporary table if not exists tmp_content_language ( 40 | like content_language including defaults 41 | ) on commit delete rows; 42 | $$; 43 | 44 | comment on function swh_mktemp_content_language() is 'Helper table to add content language'; 45 | 46 | 47 | -- create a temporary table for content_ctags tmp_content_ctags, 48 | create or replace function swh_mktemp_content_ctags() 49 | returns void 50 | language sql 51 | as $$ 52 | create temporary table if not exists tmp_content_ctags ( 53 | like content_ctags including defaults 54 | ) on commit delete rows; 55 | $$; 56 | 57 | comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; 58 | 59 | -- create a temporary table for content_fossology_license tmp_content_fossology_license, 60 | create or replace function swh_mktemp_content_fossology_license() 61 | returns void 62 | language sql 63 | as $$ 64 | create temporary table if not exists tmp_content_fossology_license ( 65 | id sha1, 66 | license text, 67 | indexer_configuration_id integer 68 | ) on commit delete rows; 69 | $$; 70 | 71 | comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; 72 | 73 | 74 | -- create a temporary table for retrieving content_metadata 75 | create or replace function swh_mktemp_content_metadata() 76 | returns void 77 | language sql 78 | as $$ 79 | create temporary table if not exists tmp_content_metadata ( 80 | like content_metadata including defaults 81 | ) on commit delete rows; 82 | $$; 83 | 84 | comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; 85 | 86 | 87 | -- create a temporary table for retrieving revision_intrinsic_metadata 88 | create or replace function swh_mktemp_revision_intrinsic_metadata() 89 | returns void 90 | language sql 91 | as $$ 92 | create temporary table if not exists tmp_revision_intrinsic_metadata ( 93 | like 
revision_intrinsic_metadata including defaults 94 | ) on commit delete rows; 95 | $$; 96 | 97 | comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; 98 | 99 | -- create a temporary table for retrieving origin_intrinsic_metadata 100 | create or replace function swh_mktemp_origin_intrinsic_metadata() 101 | returns void 102 | language sql 103 | as $$ 104 | create temporary table if not exists tmp_origin_intrinsic_metadata ( 105 | like origin_intrinsic_metadata including defaults 106 | ) on commit delete rows; 107 | $$; 108 | 109 | comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; 110 | 111 | create or replace function swh_mktemp_indexer_configuration() 112 | returns void 113 | language sql 114 | as $$ 115 | create temporary table if not exists tmp_indexer_configuration ( 116 | like indexer_configuration including defaults 117 | ) on commit delete rows; 118 | alter table tmp_indexer_configuration drop column if exists id; 119 | $$; 120 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/130.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 129 3 | -- to_version: 130 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(130, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_fossology_license_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | -- insert unknown licenses first 15 | insert into fossology_license (name) 16 | select distinct license from tmp_content_fossology_license tmp 17 | where not exists (select 1 from fossology_license where name=tmp.license) 18 | on conflict(name) do nothing; 19 | 20 | if conflict_update then 21 | insert into content_fossology_license (id, license_id, 
indexer_configuration_id) 22 | select tcl.id, 23 | (select id from fossology_license where name = tcl.license) as license, 24 | indexer_configuration_id 25 | from tmp_content_fossology_license tcl 26 | on conflict(id, license_id, indexer_configuration_id) 27 | do update set license_id = excluded.license_id; 28 | return; 29 | end if; 30 | 31 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 32 | select tcl.id, 33 | (select id from fossology_license where name = tcl.license) as license, 34 | indexer_configuration_id 35 | from tmp_content_fossology_license tcl 36 | on conflict(id, license_id, indexer_configuration_id) 37 | do nothing; 38 | return; 39 | end 40 | $$; 41 | 42 | comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/132.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 131 3 | -- to_version: 132 4 | -- description: _add function returns the inserted rows 5 | 6 | insert into dbversion(version, release, description) 7 | values(132, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_fossology_license_add(conflict_update boolean) 10 | returns bigint 11 | language plpgsql 12 | as $$ 13 | declare 14 | res bigint; 15 | begin 16 | -- insert unknown licenses first 17 | insert into fossology_license (name) 18 | select distinct license from tmp_content_fossology_license tmp 19 | where not exists (select 1 from fossology_license where name=tmp.license) 20 | on conflict(name) do nothing; 21 | 22 | if conflict_update then 23 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 24 | select tcl.id, 25 | (select id from fossology_license where name = tcl.license) as license, 26 | indexer_configuration_id 27 | from 
tmp_content_fossology_license tcl 28 | on conflict(id, license_id, indexer_configuration_id) 29 | do update set license_id = excluded.license_id; 30 | else 31 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 32 | select tcl.id, 33 | (select id from fossology_license where name = tcl.license) as license, 34 | indexer_configuration_id 35 | from tmp_content_fossology_license tcl 36 | on conflict(id, license_id, indexer_configuration_id) 37 | do nothing; 38 | end if; 39 | 40 | get diagnostics res = ROW_COUNT; 41 | return res; 42 | end 43 | $$; 44 | 45 | comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; 46 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/133.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 132 3 | -- to_version: 133 4 | -- description: remove 'conflict_update' argument 5 | 6 | insert into dbversion(version, release, description) 7 | values(133, now(), 'Work In Progress'); 8 | 9 | drop function swh_content_mimetype_add(conflict_update boolean); 10 | create or replace function swh_content_mimetype_add() 11 | returns bigint 12 | language plpgsql 13 | as $$ 14 | declare 15 | res bigint; 16 | begin 17 | insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) 18 | select id, mimetype, encoding, indexer_configuration_id 19 | from tmp_content_mimetype tcm 20 | on conflict(id, indexer_configuration_id) 21 | do update set mimetype = excluded.mimetype, 22 | encoding = excluded.encoding; 23 | 24 | get diagnostics res = ROW_COUNT; 25 | return res; 26 | end 27 | $$; 28 | 29 | comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; 30 | 31 | 32 | 33 | drop function swh_content_language_add(conflict_update boolean); 34 | create or replace function swh_content_language_add() 35 | returns 
bigint 36 | language plpgsql 37 | as $$ 38 | declare 39 | res bigint; 40 | begin 41 | insert into content_language (id, lang, indexer_configuration_id) 42 | select id, lang, indexer_configuration_id 43 | from tmp_content_language tcl 44 | on conflict(id, indexer_configuration_id) 45 | do update set lang = excluded.lang; 46 | 47 | get diagnostics res = ROW_COUNT; 48 | return res; 49 | end 50 | $$; 51 | 52 | comment on function swh_content_language_add() IS 'Add new content languages'; 53 | 54 | 55 | 56 | drop function swh_content_ctags_add(conflict_update boolean); 57 | create or replace function swh_content_ctags_add() 58 | returns bigint 59 | language plpgsql 60 | as $$ 61 | declare 62 | res bigint; 63 | begin 64 | insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) 65 | select id, name, kind, line, lang, indexer_configuration_id 66 | from tmp_content_ctags tct 67 | on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) 68 | do nothing; 69 | 70 | get diagnostics res = ROW_COUNT; 71 | return res; 72 | end 73 | $$; 74 | 75 | comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; 76 | 77 | 78 | 79 | drop function swh_content_fossology_license_add(conflict_update boolean); 80 | create or replace function swh_content_fossology_license_add() 81 | returns bigint 82 | language plpgsql 83 | as $$ 84 | declare 85 | res bigint; 86 | begin 87 | -- insert unknown licenses first 88 | insert into fossology_license (name) 89 | select distinct license from tmp_content_fossology_license tmp 90 | where not exists (select 1 from fossology_license where name=tmp.license) 91 | on conflict(name) do nothing; 92 | 93 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 94 | select tcl.id, 95 | (select id from fossology_license where name = tcl.license) as license, 96 | indexer_configuration_id 97 | from tmp_content_fossology_license tcl 98 | on conflict(id, license_id, 
indexer_configuration_id) 99 | do update set license_id = excluded.license_id; 100 | 101 | get diagnostics res = ROW_COUNT; 102 | return res; 103 | end 104 | $$; 105 | 106 | comment on function swh_content_fossology_license_add() IS 'Add new content licenses'; 107 | 108 | 109 | 110 | drop function swh_content_metadata_add(conflict_update boolean); 111 | create or replace function swh_content_metadata_add() 112 | returns bigint 113 | language plpgsql 114 | as $$ 115 | declare 116 | res bigint; 117 | begin 118 | insert into content_metadata (id, metadata, indexer_configuration_id) 119 | select id, metadata, indexer_configuration_id 120 | from tmp_content_metadata tcm 121 | on conflict(id, indexer_configuration_id) 122 | do update set metadata = excluded.metadata; 123 | 124 | get diagnostics res = ROW_COUNT; 125 | return res; 126 | end 127 | $$; 128 | 129 | comment on function swh_content_metadata_add() IS 'Add new content metadata'; 130 | 131 | 132 | 133 | drop function swh_revision_intrinsic_metadata_add(conflict_update boolean); 134 | create or replace function swh_revision_intrinsic_metadata_add() 135 | returns bigint 136 | language plpgsql 137 | as $$ 138 | declare 139 | res bigint; 140 | begin 141 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 142 | select id, metadata, mappings, indexer_configuration_id 143 | from tmp_revision_intrinsic_metadata tcm 144 | on conflict(id, indexer_configuration_id) 145 | do update set 146 | metadata = excluded.metadata, 147 | mappings = excluded.mappings; 148 | 149 | get diagnostics res = ROW_COUNT; 150 | return res; 151 | end 152 | $$; 153 | 154 | comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; 155 | 156 | 157 | 158 | drop function swh_origin_intrinsic_metadata_add(conflict_update boolean); 159 | create or replace function swh_origin_intrinsic_metadata_add() 160 | returns bigint 161 | language plpgsql 162 | as $$ 163 | declare 164 | 
res bigint; 165 | begin 166 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 167 | 168 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 169 | select id, metadata, indexer_configuration_id, from_revision, 170 | metadata_tsvector, mappings 171 | from tmp_origin_intrinsic_metadata 172 | on conflict(id, indexer_configuration_id) 173 | do update set 174 | metadata = excluded.metadata, 175 | metadata_tsvector = excluded.metadata_tsvector, 176 | mappings = excluded.mappings, 177 | from_revision = excluded.from_revision; 178 | 179 | get diagnostics res = ROW_COUNT; 180 | return res; 181 | end 182 | $$; 183 | 184 | comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; 185 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/135.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 134 3 | -- to_version: 135 4 | -- description: Add support for origin_extrinsic_metadata 5 | 6 | insert into dbversion(version, release, description) 7 | values(135, now(), 'Work In Progress'); 8 | 9 | create table origin_extrinsic_metadata( 10 | id text not null, -- origin url 11 | metadata jsonb, 12 | indexer_configuration_id bigint not null, 13 | from_remd_id sha1_git not null, 14 | metadata_tsvector tsvector, 15 | mappings text array not null 16 | ); 17 | 18 | comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin'; 19 | comment on column origin_extrinsic_metadata.id is 'url of the origin'; 20 | comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a directory'; 21 | comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; 22 | comment on column origin_extrinsic_metadata.from_remd_id is 'sha1 of the 
directory this metadata was copied from.'; 23 | comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)'; 24 | 25 | -- create a temporary table for retrieving origin_extrinsic_metadata 26 | create or replace function swh_mktemp_origin_extrinsic_metadata() 27 | returns void 28 | language sql 29 | as $$ 30 | create temporary table if not exists tmp_origin_extrinsic_metadata ( 31 | like origin_extrinsic_metadata including defaults 32 | ) on commit delete rows; 33 | $$; 34 | 35 | comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata'; 36 | 37 | create or replace function swh_mktemp_indexer_configuration() 38 | returns void 39 | language sql 40 | as $$ 41 | create temporary table if not exists tmp_indexer_configuration ( 42 | like indexer_configuration including defaults 43 | ) on commit delete rows; 44 | alter table tmp_indexer_configuration drop column if exists id; 45 | $$; 46 | 47 | -- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata, 48 | -- overwriting duplicates. 49 | -- 50 | -- If filtering duplicates is in order, the call to 51 | -- swh_origin_extrinsic_metadata_missing must take place before calling this 52 | -- function. 53 | -- 54 | -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to 55 | -- tmp_origin_extrinsic_metadata, 2. 
call this function 56 | create or replace function swh_origin_extrinsic_metadata_add() 57 | returns bigint 58 | language plpgsql 59 | as $$ 60 | declare 61 | res bigint; 62 | begin 63 | perform swh_origin_extrinsic_metadata_compute_tsvector(); 64 | 65 | insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings) 66 | select id, metadata, indexer_configuration_id, from_remd_id, 67 | metadata_tsvector, mappings 68 | from tmp_origin_extrinsic_metadata 69 | on conflict(id, indexer_configuration_id) 70 | do update set 71 | metadata = excluded.metadata, 72 | metadata_tsvector = excluded.metadata_tsvector, 73 | mappings = excluded.mappings, 74 | from_remd_id = excluded.from_remd_id; 75 | 76 | get diagnostics res = ROW_COUNT; 77 | return res; 78 | end 79 | $$; 80 | 81 | comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata'; 82 | 83 | 84 | -- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata. 85 | -- 86 | -- It uses the "pg_catalog.simple" dictionary, as it has no stopword, 87 | -- so it should be suitable for proper names and non-English text. 
88 | create or replace function swh_origin_extrinsic_metadata_compute_tsvector() 89 | returns void 90 | language plpgsql 91 | as $$ 92 | begin 93 | update tmp_origin_extrinsic_metadata 94 | set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 95 | end 96 | $$; 97 | 98 | -- origin_extrinsic_metadata 99 | create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id); 100 | alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey; 101 | 102 | alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 103 | alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey; 104 | 105 | create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector); 106 | create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings); 107 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/137.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 136 3 | -- to_version: 137 4 | -- description: Drop content_language and content_ctags tables and related functions 5 | 6 | drop function if exists swh_content_language_add; 7 | drop function if exists swh_mktemp_content_language(); 8 | drop function if exists swh_mktemp_content_ctags(); 9 | drop function if exists swh_content_ctags_add(); 10 | drop function if exists swh_content_ctags_search; 11 | 12 | drop type if exists content_ctags_signature; 13 | 14 | drop table if exists content_language; 15 | drop table if exists content_ctags; 16 | 17 | drop type if exists languages; 18 | drop type if exists ctags_languages; 19 | 20 
| -------------------------------------------------------------------------------- /swh/indexer/storage/writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict, Iterable, Optional 7 | 8 | try: 9 | from swh.journal.writer import JournalWriterInterface, get_journal_writer 10 | except ImportError: 11 | get_journal_writer = None # type: ignore 12 | # mypy limitation, see https://github.com/python/mypy/issues/1153 13 | 14 | from .model import BaseRow 15 | 16 | 17 | class JournalWriter: 18 | """Journal writer storage collaborator. It's in charge of adding objects to 19 | the journal. 20 | 21 | """ 22 | 23 | journal: Optional[JournalWriterInterface] 24 | 25 | def __init__(self, journal_writer: Dict[str, Any]): 26 | """ 27 | Args: 28 | journal_writer: configuration passed to 29 | `swh.journal.writer.get_journal_writer` 30 | """ 31 | if journal_writer: 32 | if get_journal_writer is None: 33 | raise EnvironmentError( 34 | "You need the swh.journal package to use the " 35 | "journal_writer feature" 36 | ) 37 | self.journal = get_journal_writer( 38 | **journal_writer, 39 | value_sanitizer=lambda object_type, value_dict: value_dict, 40 | ) 41 | else: 42 | self.journal = None 43 | 44 | def write_additions(self, obj_type, entries: Iterable[BaseRow]) -> None: 45 | if not self.journal: 46 | return 47 | 48 | translated = [] 49 | 50 | for entry in entries: 51 | assert entry.object_type == obj_type # type: ignore 52 | 53 | # ids are internal to the database and should not be sent to postgresql 54 | if entry.indexer_configuration_id is not None: 55 | raise ValueError( 56 | f"{entry} passed to JournalWriter.write_additions has " 57 | 
f"indexer_configuration_id instead of full tool dict" 58 | ) 59 | assert entry.tool, "Missing both indexer_configuration_id and tool dict" 60 | if "id" in entry.tool: 61 | raise ValueError( 62 | f"{entry} passed to JournalWriter.write_additions " 63 | f"contains a tool id" 64 | ) 65 | 66 | translated.append(entry) 67 | 68 | # write to kafka 69 | self.journal.write_additions(obj_type, translated) 70 | -------------------------------------------------------------------------------- /swh/indexer/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | import swh.indexer 4 | 5 | __all__ = ["start_worker_thread"] 6 | 7 | SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql") 8 | -------------------------------------------------------------------------------- /swh/indexer/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from functools import partial 7 | import os 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | from pytest_postgresql import factories 12 | import yaml 13 | 14 | from swh.core.db.db_utils import initialize_database_for_module 15 | from swh.indexer.storage import IndexerStorage, get_indexer_storage 16 | from swh.objstorage.factory import get_objstorage 17 | from swh.storage import get_storage 18 | 19 | from .utils import fill_obj_storage, fill_storage 20 | 21 | idx_postgresql_proc = factories.postgresql_proc( 22 | load=[ 23 | partial( 24 | initialize_database_for_module, 25 | modname="indexer.storage", 26 | version=IndexerStorage.current_version, 27 | ) 28 | ], 29 | ) 30 | 31 | idx_storage_postgresql = 
idx_storage_postgresql = factories.postgresql("idx_postgresql_proc")


@pytest.fixture
def idx_storage_backend_config(idx_storage_postgresql):
    """Plain PostgreSQL indexer-storage configuration, without a journal
    collaborator (so clients of this fixture do not pull in the optional
    swh.journal dependency)."""
    return {
        "cls": "postgresql",
        "db": idx_storage_postgresql.info.dsn,
    }


@pytest.fixture
def swh_indexer_config(
    swh_storage_backend_config,
    idx_storage_backend_config,
):
    """Full indexer configuration dict, wired to the test backends."""
    config = {
        "storage": swh_storage_backend_config,
        "objstorage": {"cls": "memory"},
        "indexer_storage": idx_storage_backend_config,
        "tools": {
            "name": "file",
            "version": "1:5.30-1+deb9u1",
            "configuration": {"type": "library", "debian-package": "python3-magic"},
        },
        # for the rehash indexer
        "compute_checksums": ["blake2b512"],
    }
    return config


@pytest.fixture
def idx_storage(swh_indexer_config):
    """Indexer storage instance that gets injected into all indexer classes."""
    return get_indexer_storage(**swh_indexer_config["indexer_storage"])


@pytest.fixture
def storage(swh_indexer_config):
    """Pre-filled storage instance that gets injected into all indexer
    classes."""
    store = get_storage(**swh_indexer_config["storage"])
    fill_storage(store)
    return store


@pytest.fixture
def obj_storage(swh_indexer_config):
    """Pre-filled in-memory objstorage, patched into swh.indexer.indexer for
    the duration of the test."""
    objstorage = get_objstorage(**swh_indexer_config["objstorage"])
    fill_obj_storage(objstorage)
    with patch("swh.indexer.indexer.get_objstorage", return_value=objstorage):
        yield objstorage


@pytest.fixture
def swh_config(swh_indexer_config, monkeypatch, tmp_path):
    """Dump the indexer configuration to a YAML file, point
    SWH_CONFIG_FILENAME at it, and return its path."""
    conf_path = os.path.join(str(tmp_path), "indexer.yml")
    with open(conf_path, "w") as fd:
        yaml.dump(swh_indexer_config, fd)
    monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path)
    return conf_path
def test_compute_metadata_pubspec():
    """Translate a representative pubspec.yaml: folded (``>-``) description,
    keyword list, license, and homepage mapping."""
    raw_content = b"""
---
name: newtify
description: >-
  Have you been turned into a newt? Would you like to be?
  This package can help. It has all of the
  newt-transmogrification functionality you have been looking
  for.
keywords:
  - polyfill
  - shim
  - compatibility
  - portable
  - mbstring
version: 1.2.3
license: MIT
homepage: https://example-pet-store.com/newtify
documentation: https://example-pet-store.com/newtify/docs

environment:
  sdk: '>=2.10.0 <3.0.0'

dependencies:
  efts: ^2.0.4
  transmogrify: ^0.4.0

dev_dependencies:
  test: '>=1.15.0 <2.0.0'
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    # keyword ordering is an implementation detail: compare as a set
    assert set(result.pop("keywords")) == {
        "polyfill",
        "shim",
        "compatibility",
        "portable",
        "mbstring",
    }, result
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "newtify",
        "description": "Have you been turned into a newt? Would you like to be? "
        "This package can help. It has all of the "
        "newt-transmogrification functionality you have been looking "
        "for.",
        "url": "https://example-pet-store.com/newtify",
        "license": "https://spdx.org/licenses/MIT",
    }

    assert result == expected


def test_normalize_author_pubspec():
    """A scalar ``author:`` entry becomes a one-element Person list.

    NOTE(review): the ``<atlee@example.org>`` markup had been stripped from
    the fixture (angle brackets lost in a dump), so the expected ``email``
    entry could never be produced; restored it.
    """
    raw_content = b"""
author: Atlee Pine <atlee@example.org>
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Atlee Pine", "email": "atlee@example.org"},
        ],
    }

    assert result == expected


def test_normalize_authors_pubspec():
    """``authors:`` list: entries with and without an email address.

    NOTE(review): restored the ``<vmz@example.org>`` markup that had been
    stripped from the fixture; without it the expected ``email`` entry could
    never match.
    """
    raw_content = b"""
authors:
  - Vicky Merzown <vmz@example.org>
  - Ron Bilius Weasley
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Vicky Merzown", "email": "vmz@example.org"},
            {
                "type": "Person",
                "name": "Ron Bilius Weasley",
            },
        ],
    }

    assert result == expected
assert result == expected 136 | 137 | 138 | def test_normalize_empty_authors(): 139 | raw_content = b""" 140 | authors: 141 | """ 142 | 143 | result = MAPPINGS["PubMapping"]().translate(raw_content) 144 | 145 | expected = { 146 | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", 147 | "type": "SoftwareSourceCode", 148 | } 149 | 150 | assert result == expected 151 | 152 | 153 | def test_invalid_yaml(): 154 | raw_content = b""" 155 | name: smartech_push 156 | license: { :type => "Commercial", :file => "LICENSE" } 157 | """ 158 | 159 | result = MAPPINGS["PubMapping"]().translate(raw_content) 160 | 161 | assert result is None 162 | 163 | 164 | def test_invalid_tag(): 165 | raw_content = b""" 166 | name: translatron 167 | description: !BETA VERSION - NOT FOR LIVE OR PROD USAGE! 168 | """ 169 | 170 | result = MAPPINGS["PubMapping"]().translate(raw_content) 171 | 172 | assert result is None 173 | -------------------------------------------------------------------------------- /swh/indexer/tests/metadata_dictionary/test_gitea.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.metadata_dictionary import MAPPINGS 7 | 8 | CONTEXT = [ 9 | "https://doi.org/10.5063/schema/codemeta-2.0", 10 | { 11 | "as": "https://www.w3.org/ns/activitystreams#", 12 | "forge": "https://forgefed.org/ns#", 13 | "xsd": "http://www.w3.org/2001/XMLSchema#", 14 | }, 15 | ] 16 | 17 | 18 | def test_compute_metadata_none(): 19 | """ 20 | testing content empty content is empty 21 | should return None 22 | """ 23 | content = b"" 24 | 25 | # None if no metadata was found or an error occurred 26 | declared_metadata = None 27 | result = MAPPINGS["GiteaMapping"]().translate(content) 28 | 
def test_supported_terms():
    """The Gitea mapping must advertise at least these schema.org /
    ForgeFed / ActivityStreams terms."""
    expected = {
        "http://schema.org/name",
        "http://schema.org/dateCreated",
        "https://forgefed.org/ns#forks",
        "https://www.w3.org/ns/activitystreams#totalItems",
    }
    assert expected <= MAPPINGS["GiteaMapping"].supported_terms()
"2022-06-13T18:54:26+02:00", 93 | "updated_at": "2022-09-02T03:57:22+02:00", 94 | "permissions": { 95 | "admin": false, 96 | "push": false, 97 | "pull": true 98 | }, 99 | "has_issues": true, 100 | "internal_tracker": { 101 | "enable_time_tracker": true, 102 | "allow_only_contributors_to_track_time": true, 103 | "enable_issue_dependencies": true 104 | }, 105 | "has_wiki": false, 106 | "has_pull_requests": true, 107 | "has_projects": true, 108 | "ignore_whitespace_conflicts": false, 109 | "allow_merge_commits": false, 110 | "allow_rebase": false, 111 | "allow_rebase_explicit": false, 112 | "allow_squash_merge": true, 113 | "default_merge_style": "squash", 114 | "avatar_url": "", 115 | "internal": false, 116 | "mirror_interval": "", 117 | "mirror_updated": "0001-01-01T00:00:00Z", 118 | "repo_transfer": null 119 | } 120 | """ 121 | result = MAPPINGS["GiteaMapping"]().translate(content) 122 | assert result == { 123 | "@context": CONTEXT, 124 | "type": "forge:Repository", 125 | "id": "https://codeberg.org/ForgeFed/ForgeFed", 126 | "forge:forks": { 127 | "as:totalItems": {"type": "xsd:nonNegativeInteger", "@value": "6"}, 128 | "type": "as:OrderedCollection", 129 | }, 130 | "as:likes": { 131 | "as:totalItems": { 132 | "type": "xsd:nonNegativeInteger", 133 | "@value": "30", 134 | }, 135 | "type": "as:Collection", 136 | }, 137 | "as:followers": { 138 | "as:totalItems": { 139 | "type": "xsd:nonNegativeInteger", 140 | "@value": "11", 141 | }, 142 | "type": "as:Collection", 143 | }, 144 | "name": "ForgeFed", 145 | "description": "ActivityPub-based forge federation protocol specification", 146 | "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", 147 | "dateCreated": "2022-06-13T18:54:26+02:00", 148 | "dateModified": "2022-09-02T03:57:22+02:00", 149 | "programmingLanguage": "CSS", 150 | "url": "https://forgefed.org", 151 | } 152 | 153 | 154 | def test_gitea_fork(): 155 | content = b""" 156 | { 157 | "name": "fork-name", 158 | "description": "fork description", 159 | 
"html_url": "http://example.org/test-fork", 160 | "parent": { 161 | "name": "parent-name", 162 | "description": "parent description", 163 | "html_url": "http://example.org/test-software" 164 | } 165 | } 166 | """ 167 | result = MAPPINGS["GiteaMapping"]().translate(content) 168 | assert result == { 169 | "@context": CONTEXT, 170 | "type": "forge:Repository", 171 | "id": "http://example.org/test-fork", 172 | "description": "fork description", 173 | "name": "fork-name", 174 | "forge:forkedFrom": { 175 | "id": "http://example.org/test-software", 176 | }, 177 | } 178 | -------------------------------------------------------------------------------- /swh/indexer/tests/metadata_dictionary/test_python.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.metadata_dictionary import MAPPINGS 7 | from swh.indexer.metadata_dictionary.base import DirectoryLsEntry 8 | from swh.model.hashutil import hash_to_bytes 9 | from swh.objstorage.interface import CompositeObjId 10 | 11 | 12 | def test_compute_metadata_pkginfo(): 13 | raw_content = b"""\ 14 | Metadata-Version: 2.1 15 | Name: swh.core 16 | Version: 0.0.49 17 | Summary: Software Heritage core utilities 18 | Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ 19 | Author: Software Heritage developers 20 | Author-email: swh-devel@inria.fr 21 | License: UNKNOWN 22 | Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest 23 | Project-URL: Funding, https://www.softwareheritage.org/donate 24 | Project-URL: Source, https://forge.softwareheritage.org/source/swh-core 25 | Description: swh-core 26 | ======== 27 | \x20 28 | core library for swh's modules: 29 | - config parser 30 | - hash 
def test_compute_metadata_pkginfo_utf8():
    """Non-ASCII bytes in PKG-INFO fields are decoded as UTF-8."""
    # assumes continuation lines use the usual 8-space PKG-INFO indent —
    # TODO confirm against the original fixture
    pkginfo = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
        Hydrology N\xc2\xb083
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "snowpyt",
        "description": "foo\nHydrology N°83",
    }


def test_compute_metadata_pkginfo_keywords():
    """A whitespace-separated Keywords field becomes a list
    (order-insensitive)."""
    pkginfo = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    keywords = translated.pop("keywords")
    assert set(keywords) == {"foo", "bar", "baz"}, translated
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "foo",
    }


def test_compute_metadata_pkginfo_license():
    """An SPDX identifier in the License field is expanded to a spdx.org
    URL."""
    pkginfo = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "foo",
        "license": "https://spdx.org/licenses/MIT",
    }


def test_detect_metadata_files():
    """PKG-INFO directory entries are detected and reported by their sha1."""
    entry = DirectoryLsEntry(
        type="file",
        name=b"PKG-INFO",
        target=hash_to_bytes("1" * 40),
        sha1=hash_to_bytes("2" * 40),
    )
    detected = MAPPINGS["PythonPkginfoMapping"]().detect_metadata_files([entry])
    assert detected == [CompositeObjId(sha1=entry["sha1"])]
@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
def test_gemspec_two_author_fields():
    """When both ``author`` and ``authors`` are present, the two persons are
    merged (in either order)."""
    gemspec = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
    result = MAPPINGS["GemspecMapping"]().translate(gemspec)
    # the merge order is unspecified: compare after sorting by name
    coders = result.pop("author")
    assert sorted(coders, key=lambda person: person["name"]) == [
        {"type": "Person", "name": "Ruby Coder1"},
        {"type": "Person", "name": "Ruby Coder2"},
    ]
    assert result == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }


def test_gemspec_invalid_author():
    """Author values that are not plain strings are ignored."""
    base = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }
    mapping = MAPPINGS["GemspecMapping"]()

    # a list where a single string is expected
    assert (
        mapping.translate(
            b"""
Gem::Specification.new do |s|
  s.author = ["Ruby Coder"]
end"""
        )
        == base
    )

    # a trailing comma makes the value a tuple-like expression, not a string
    assert (
        mapping.translate(
            b"""
Gem::Specification.new do |s|
  s.author = "Ruby Coder1",
end"""
        )
        == base
    )

    # nested lists: only the well-formed entry is kept
    assert mapping.translate(
        b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
    ) == {
        **base,
        "author": [{"type": "Person", "name": "Ruby Coder1"}],
    }


def test_gemspec_alternative_header():
    """Gemspecs using the brace-block form (and a leading require) are still
    recognized."""
    gemspec = b"""
require './lib/version'

Gem::Specification.new { |s|
  s.name = 'rb-system-with-aliases'
  s.summary = 'execute system commands with aliases'
}
"""
    assert MAPPINGS["GemspecMapping"]().translate(gemspec) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "rb-system-with-aliases",
        "description": "execute system commands with aliases",
    }


@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
    strategies.dictionaries(
        # keys: arbitrary text, or one of the field names the mapping knows
        strategies.one_of(
            strategies.text(),
            *map(strategies.just, MAPPINGS["GemspecMapping"].mapping),  # type: ignore
        ),
        # values: arbitrarily nested, non-empty lists of characters
        strategies.recursive(
            strategies.characters(),
            lambda children: strategies.lists(children, min_size=1),
        ),
    )
)
def test_gemspec_adversarial(doc):
    """The mapping must never crash, whatever the gemspec contains."""
    lines = [b"Gem::Specification.new do |s|\n"]
    for field, value in doc.items():
        lines.append("  s.{} = {}\n".format(field, repr(value)).encode())
    lines.append(b"end\n")
    MAPPINGS["GemspecMapping"]().translate(b"".join(lines))
-------------------------------------------------------------------------------- /swh/indexer/tests/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from os import path 7 | 8 | import swh.indexer 9 | 10 | SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql") 11 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from os.path import join 7 | 8 | import pytest 9 | 10 | from swh.indexer.storage import get_indexer_storage 11 | from swh.indexer.storage.model import ContentLicenseRow, ContentMimetypeRow 12 | from swh.indexer.tests.conftest import idx_storage_postgresql 13 | from swh.model.hashutil import hash_to_bytes 14 | 15 | from . 
class DataObj(dict):
    """Dict with attribute-style access, used to group test data.

    ``data.key`` reads/writes ``data["key"]``.
    """

    def __getattr__(self, key):
        # __getattr__ must raise AttributeError (not KeyError) for missing
        # keys, so that hasattr(), getattr(obj, k, default) and the
        # copy/pickle protocols (which probe for optional attributes)
        # behave correctly.
        try:
            return self.__getitem__(key)
        except KeyError as exc:
            raise AttributeError(key) from exc

    def __setattr__(self, key, value):
        return self.__setitem__(key, value)


@pytest.fixture
def swh_indexer_storage_with_data(swh_indexer_storage):
    """Indexer storage pre-loaded with tools and mimetype rows; returns the
    storage together with a DataObj describing what was inserted."""
    data = DataObj()
    # register the tools and index the resulting rows (with ids) by name
    tools = {
        tool["tool_name"]: {
            "id": tool["id"],
            "name": tool["tool_name"],
            "version": tool["tool_version"],
            "configuration": tool["tool_configuration"],
        }
        for tool in swh_indexer_storage.indexer_configuration_add(TOOLS)
    }
    data.tools = tools
    data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
    data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
    data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
    data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
    data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
    data.origin_url_1 = "file:///dev/0/zero"  # 44434341
    data.origin_url_2 = "file:///dev/1/one"  # 44434342
    data.origin_url_3 = "file:///dev/2/two"  # 54974445
    data.mimetypes = [
        ContentMimetypeRow(indexer_configuration_id=tools["file"]["id"], **mimetype_obj)
        for mimetype_obj in MIMETYPE_OBJECTS
    ]
    swh_indexer_storage.content_mimetype_add(data.mimetypes)
    # NOTE(review): license rows are built but NOT added to the storage here —
    # presumably individual tests add them as needed; confirm against callers
    data.fossology_licenses = [
        ContentLicenseRow(
            id=fossology_obj["id"],
            indexer_configuration_id=tools["nomos"]["id"],
            license=license,
        )
        for fossology_obj in FOSSOLOGY_LICENSES
        for license in fossology_obj["licenses"]
    ]
    swh_indexer_storage._test_data = data

    return (swh_indexer_storage, data)


swh_indexer_storage_postgresql = idx_storage_postgresql
swh_indexer_storage(swh_indexer_storage_postgresql): 74 | return get_indexer_storage( 75 | "postgresql", 76 | db=swh_indexer_storage_postgresql.info.dsn, 77 | journal_writer={ 78 | "cls": "memory", 79 | }, 80 | ) 81 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/generate_data_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from uuid import uuid1 7 | 8 | from hypothesis.strategies import composite, one_of, sampled_from, sets, tuples, uuids 9 | 10 | from swh.model.hashutil import MultiHash 11 | 12 | MIMETYPES = [ 13 | b"application/json", 14 | b"application/octet-stream", 15 | b"application/xml", 16 | b"text/plain", 17 | ] 18 | 19 | ENCODINGS = [ 20 | b"iso8859-1", 21 | b"iso8859-15", 22 | b"latin1", 23 | b"utf-8", 24 | ] 25 | 26 | 27 | def gen_mimetype(): 28 | """Generate one mimetype strategy.""" 29 | return one_of(sampled_from(MIMETYPES)) 30 | 31 | 32 | def gen_encoding(): 33 | """Generate one encoding strategy.""" 34 | return one_of(sampled_from(ENCODINGS)) 35 | 36 | 37 | def _init_content(uuid): 38 | """Given a uuid, initialize a content""" 39 | return { 40 | "id": MultiHash.from_data(uuid.bytes, {"sha1"}).digest()["sha1"], 41 | "indexer_configuration_id": 1, 42 | } 43 | 44 | 45 | @composite 46 | def gen_content_mimetypes(draw, *, min_size=0, max_size=100): 47 | """Generate valid and consistent content_mimetypes. 
48 | 49 | Context: Test purposes 50 | 51 | Args: 52 | **draw** (callable): Used by hypothesis to generate data 53 | **min_size** (int): Minimal number of elements to generate 54 | (default: 0) 55 | **max_size** (int): Maximal number of elements to generate 56 | (default: 100) 57 | 58 | Returns: 59 | List of content_mimetypes as expected by the 60 | content_mimetype_add api endpoint. 61 | 62 | """ 63 | _ids = draw( 64 | sets( 65 | tuples(uuids(), gen_mimetype(), gen_encoding()), 66 | min_size=min_size, 67 | max_size=max_size, 68 | ) 69 | ) 70 | 71 | content_mimetypes = [] 72 | for uuid, mimetype, encoding in _ids: 73 | content_mimetypes.append( 74 | { 75 | **_init_content(uuid), 76 | "mimetype": mimetype, 77 | "encoding": encoding, 78 | } 79 | ) 80 | return content_mimetypes 81 | 82 | 83 | TOOLS = [ 84 | { 85 | "tool_name": "swh-metadata-translator", 86 | "tool_version": "0.0.1", 87 | "tool_configuration": {"type": "local", "context": "NpmMapping"}, 88 | }, 89 | { 90 | "tool_name": "swh-metadata-detector", 91 | "tool_version": "0.0.1", 92 | "tool_configuration": { 93 | "type": "local", 94 | "context": ["NpmMapping", "CodemetaMapping"], 95 | }, 96 | }, 97 | { 98 | "tool_name": "swh-metadata-detector2", 99 | "tool_version": "0.0.1", 100 | "tool_configuration": { 101 | "type": "local", 102 | "context": ["NpmMapping", "CodemetaMapping"], 103 | }, 104 | }, 105 | { 106 | "tool_name": "file", 107 | "tool_version": "5.22", 108 | "tool_configuration": {"command_line": "file --mime "}, 109 | }, 110 | { 111 | "tool_name": "pygments", 112 | "tool_version": "2.0.1+dfsg-1.1+deb8u1", 113 | "tool_configuration": {"type": "library", "debian-package": "python3-pygments"}, 114 | }, 115 | { 116 | "tool_name": "pygments2", 117 | "tool_version": "2.0.1+dfsg-1.1+deb8u1", 118 | "tool_configuration": { 119 | "type": "library", 120 | "debian-package": "python3-pygments", 121 | "max_content_size": 10240, 122 | }, 123 | }, 124 | { 125 | "tool_name": "nomos", 126 | "tool_version": 
"3.1.0rc2-31-ga2cbb8c", 127 | "tool_configuration": {"command_line": "nomossa "}, 128 | }, 129 | ] 130 | 131 | 132 | MIMETYPE_OBJECTS = [ 133 | { 134 | "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"], 135 | "mimetype": mt, 136 | "encoding": enc, 137 | # 'indexer_configuration_id' will be added after TOOLS get registered 138 | } 139 | for mt in MIMETYPES 140 | for enc in ENCODINGS 141 | ] 142 | 143 | LICENSES = [ 144 | b"3DFX", 145 | b"BSD", 146 | b"GPL", 147 | b"Apache2", 148 | b"MIT", 149 | ] 150 | 151 | FOSSOLOGY_LICENSES = [ 152 | { 153 | "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"], 154 | "licenses": [ 155 | LICENSES[i % len(LICENSES)], 156 | ], 157 | # 'indexer_configuration_id' will be added after TOOLS get registered 158 | } 159 | for i in range(10) 160 | ] 161 | 162 | 163 | def gen_license(): 164 | return one_of(sampled_from(LICENSES)) 165 | 166 | 167 | @composite 168 | def gen_content_fossology_licenses(draw, *, min_size=0, max_size=100): 169 | """Generate valid and consistent content_fossology_licenses. 170 | 171 | Context: Test purposes 172 | 173 | Args: 174 | **draw** (callable): Used by hypothesis to generate data 175 | **min_size** (int): Minimal number of elements to generate 176 | (default: 0) 177 | **max_size** (int): Maximal number of elements to generate 178 | (default: 100) 179 | 180 | Returns: 181 | List of content_fossology_licenses as expected by the 182 | content_fossology_license_add api endpoint. 
183 | 184 | """ 185 | _ids = draw( 186 | sets( 187 | tuples( 188 | uuids(), 189 | gen_license(), 190 | ), 191 | min_size=min_size, 192 | max_size=max_size, 193 | ) 194 | ) 195 | 196 | content_licenses = [] 197 | for uuid, license in _ids: 198 | content_licenses.append( 199 | { 200 | **_init_content(uuid), 201 | "licenses": [license], 202 | } 203 | ) 204 | return content_licenses 205 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_api_client.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import psycopg 7 | import pytest 8 | 9 | from swh.core.api import RemoteException, TransientRemoteException 10 | from swh.indexer.storage import get_indexer_storage 11 | from swh.indexer.storage.api.client import RemoteStorage 12 | import swh.indexer.storage.api.server as server 13 | 14 | from .test_storage import * # noqa 15 | 16 | 17 | @pytest.fixture 18 | def app_server(swh_indexer_storage_postgresql): 19 | server.storage = get_indexer_storage( 20 | "postgresql", 21 | db=swh_indexer_storage_postgresql.info.dsn, 22 | journal_writer={ 23 | "cls": "memory", 24 | }, 25 | ) 26 | yield server 27 | 28 | 29 | @pytest.fixture 30 | def app(app_server): 31 | return app_server.app 32 | 33 | 34 | @pytest.fixture 35 | def swh_rpc_client_class(): 36 | # these are needed for the swh_indexer_storage_with_data fixture 37 | assert hasattr(RemoteStorage, "indexer_configuration_add") 38 | assert hasattr(RemoteStorage, "content_mimetype_add") 39 | return RemoteStorage 40 | 41 | 42 | @pytest.fixture 43 | def swh_indexer_storage(swh_rpc_client, app_server): 44 | # This version of the swh_storage fixture uses the swh_rpc_client 
fixture 45 | # to instantiate a RemoteStorage (see swh_rpc_client_class above) that 46 | # proxies, via the swh.core RPC mechanism, the local (in memory) storage 47 | # configured in the app fixture above. 48 | # 49 | # Also note that, for the sake of 50 | # making it easier to write tests, the in-memory journal writer of the 51 | # in-memory backend storage is attached to the RemoteStorage as its 52 | # journal_writer attribute. 53 | storage = swh_rpc_client 54 | 55 | journal_writer = getattr(storage, "journal_writer", None) 56 | storage.journal_writer = app_server.storage.journal_writer 57 | yield storage 58 | storage.journal_writer = journal_writer 59 | 60 | 61 | def test_exception(app_server, swh_indexer_storage, mocker): 62 | """Checks the client re-raises unknown exceptions as a :exc:`RemoteException`""" 63 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 64 | mocker.patch.object( 65 | app_server.storage, 66 | "content_mimetype_get", 67 | side_effect=ValueError("crash"), 68 | ) 69 | with pytest.raises(RemoteException) as e: 70 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 71 | assert not isinstance(e, TransientRemoteException) 72 | 73 | 74 | def test_operationalerror_exception(app_server, swh_indexer_storage, mocker): 75 | """Checks the client re-raises as a :exc:`TransientRemoteException` 76 | rather than the base :exc:`RemoteException`; so the retrying proxy 77 | retries for longer.""" 78 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 79 | mocker.patch.object( 80 | app_server.storage, 81 | "content_mimetype_get", 82 | side_effect=psycopg.errors.AdminShutdown("cluster is shutting down"), 83 | ) 84 | with pytest.raises(RemoteException) as excinfo: 85 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 86 | assert isinstance(excinfo.value, TransientRemoteException) 87 | 88 | 89 | def test_querycancelled_exception(app_server, swh_indexer_storage, mocker): 90 | """Checks the client re-raises as 
a :exc:`TransientRemoteException` 91 | rather than the base :exc:`RemoteException`; so the retrying proxy 92 | retries for longer.""" 93 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 94 | mocker.patch.object( 95 | app_server.storage, 96 | "content_mimetype_get", 97 | side_effect=psycopg.errors.QueryCanceled("too big!"), 98 | ) 99 | with pytest.raises(RemoteException) as excinfo: 100 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 101 | assert not isinstance(excinfo.value, TransientRemoteException) 102 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_converters.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.storage import converters 7 | 8 | 9 | def test_db_to_mimetype() -> None: 10 | input_mimetype = { 11 | "id": b"some-id", 12 | "tool_id": 10, 13 | "tool_name": "some-toolname", 14 | "tool_version": "some-toolversion", 15 | "tool_configuration": {}, 16 | "encoding": b"ascii", 17 | "mimetype": b"text/plain", 18 | } 19 | 20 | expected_mimetype = { 21 | "id": b"some-id", 22 | "encoding": b"ascii", 23 | "mimetype": b"text/plain", 24 | "tool": { 25 | "id": 10, 26 | "name": "some-toolname", 27 | "version": "some-toolversion", 28 | "configuration": {}, 29 | }, 30 | } 31 | 32 | actual_mimetype = converters.db_to_mimetype(input_mimetype) 33 | 34 | assert actual_mimetype == expected_mimetype 35 | 36 | 37 | def test_db_to_fossology_license() -> None: 38 | input_license = { 39 | "id": b"some-id", 40 | "tool_id": 20, 41 | "tool_name": "nomossa", 42 | "tool_version": "5.22", 43 | "tool_configuration": {}, 44 | "license": "GPL2.0", 45 | } 46 | 47 | 
expected_license = { 48 | "id": b"some-id", 49 | "license": "GPL2.0", 50 | "tool": { 51 | "id": 20, 52 | "name": "nomossa", 53 | "version": "5.22", 54 | "configuration": {}, 55 | }, 56 | } 57 | 58 | actual_license = converters.db_to_fossology_license(input_license) 59 | 60 | assert actual_license == expected_license 61 | 62 | 63 | def test_db_to_metadata() -> None: 64 | input_metadata = { 65 | "id": b"some-id", 66 | "tool_id": 20, 67 | "tool_name": "some-toolname", 68 | "tool_version": "some-toolversion", 69 | "tool_configuration": {}, 70 | "metadata": b"metadata", 71 | } 72 | 73 | expected_metadata = { 74 | "id": b"some-id", 75 | "metadata": b"metadata", 76 | "tool": { 77 | "id": 20, 78 | "name": "some-toolname", 79 | "version": "some-toolversion", 80 | "configuration": {}, 81 | }, 82 | } 83 | 84 | actual_metadata = converters.db_to_metadata(input_metadata) 85 | 86 | assert actual_metadata == expected_metadata 87 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_in_memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | 8 | from swh.indexer.storage import get_indexer_storage 9 | 10 | from .test_storage import * # noqa 11 | 12 | 13 | @pytest.fixture 14 | def swh_indexer_storage(): 15 | return get_indexer_storage( 16 | "memory", 17 | journal_writer={ 18 | "cls": "memory", 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2020 The Software Heritage developers 2 | # See the AUTHORS 
file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from unittest.mock import patch 7 | 8 | from swh.indexer.storage.metrics import ( 9 | OPERATIONS_METRIC, 10 | OPERATIONS_UNIT_METRIC, 11 | send_metric, 12 | ) 13 | 14 | 15 | def test_send_metric_unknown_unit() -> None: 16 | r = send_metric("content", count=10, method_name="content_add") 17 | assert r is False 18 | r = send_metric("sthg:add:bytes:extra", count=10, method_name="sthg_add") 19 | assert r is False 20 | 21 | 22 | def test_send_metric_no_value() -> None: 23 | r = send_metric("content_mimetype:add", count=0, method_name="content_mimetype_add") 24 | assert r is False 25 | 26 | 27 | @patch("swh.indexer.storage.metrics.statsd.increment") 28 | def test_send_metric_no_unit(mock_statsd) -> None: 29 | r = send_metric( 30 | "content_mimetype:add", count=10, method_name="content_mimetype_add" 31 | ) 32 | 33 | mock_statsd.assert_called_with( 34 | OPERATIONS_METRIC, 35 | 10, 36 | tags={ 37 | "endpoint": "content_mimetype_add", 38 | "object_type": "content_mimetype", 39 | "operation": "add", 40 | }, 41 | ) 42 | 43 | assert r 44 | 45 | 46 | @patch("swh.indexer.storage.metrics.statsd.increment") 47 | def test_send_metric_unit(mock_statsd) -> None: 48 | unit_ = "bytes" 49 | r = send_metric("c:add:%s" % unit_, count=100, method_name="c_add") 50 | 51 | expected_metric = OPERATIONS_UNIT_METRIC.format(unit=unit_) 52 | mock_statsd.assert_called_with( 53 | expected_metric, 54 | 100, 55 | tags={ 56 | "endpoint": "c_add", 57 | "object_type": "c", 58 | "operation": "add", 59 | }, 60 | ) 61 | 62 | assert r 63 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS 
file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | 8 | from swh.indexer.storage.model import BaseRow, ContentLicenseRow 9 | 10 | 11 | def test_unique_key__no_tool_dict(): 12 | with pytest.raises(ValueError, match="indexer_configuration_id"): 13 | BaseRow(id=12, indexer_configuration_id=34).unique_key() 14 | with pytest.raises(ValueError, match="indexer_configuration_id"): 15 | ContentLicenseRow( 16 | id=12, indexer_configuration_id=34, license="BSD" 17 | ).unique_key() 18 | 19 | 20 | def test_unique_key(): 21 | assert BaseRow( 22 | id=12, tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}} 23 | ).unique_key() == { 24 | "id": 12, 25 | "tool_name": "foo", 26 | "tool_version": "1.2.3", 27 | "tool_configuration": "{}", 28 | } 29 | 30 | assert ContentLicenseRow( 31 | id=12, 32 | tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}}, 33 | license="BSD", 34 | ).unique_key() == { 35 | "id": 12, 36 | "license": "BSD", 37 | "tool_name": "foo", 38 | "tool_version": "1.2.3", 39 | "tool_configuration": "{}", 40 | } 41 | 42 | assert ContentLicenseRow( 43 | id=12, 44 | tool={ 45 | "id": 34, 46 | "name": "foo", 47 | "version": "1.2.3", 48 | "configuration": {"foo": 1, "bar": 2}, 49 | }, 50 | license="BSD", 51 | ).unique_key() == { 52 | "id": 12, 53 | "license": "BSD", 54 | "tool_name": "foo", 55 | "tool_version": "1.2.3", 56 | "tool_configuration": '{"bar": 2, "foo": 1}', 57 | } 58 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 
| # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | import yaml 8 | 9 | from swh.indexer.storage.api.server import load_and_check_config 10 | 11 | 12 | def prepare_config_file(tmpdir, content, name="config.yml") -> str: 13 | """Prepare configuration file in `$tmpdir/name` with content `content`. 14 | 15 | Args: 16 | tmpdir (LocalPath): root directory 17 | content (str/dict): Content of the file either as string or as a dict. 18 | If a dict, converts the dict into a yaml string. 19 | name (str): configuration filename 20 | 21 | Returns 22 | path (str) of the configuration file prepared. 23 | 24 | """ 25 | config_path = tmpdir / name 26 | if isinstance(content, dict): # convert if needed 27 | content = yaml.dump(content) 28 | config_path.write_text(content, encoding="utf-8") 29 | # pytest on python3.5 does not support LocalPath manipulation, so 30 | # convert path to string 31 | return str(config_path) 32 | 33 | 34 | @pytest.mark.parametrize("config_path", [None, ""]) 35 | def test_load_and_check_config_no_configuration(config_path) -> None: 36 | """Irrelevant configuration file path raises""" 37 | with pytest.raises(EnvironmentError, match="Configuration file must be defined"): 38 | load_and_check_config(config_path) 39 | 40 | 41 | def test_load_and_check_inexistent_config_path() -> None: 42 | """Inexistent configuration file raises""" 43 | config_path = "/indexer/inexistent/config.yml" 44 | expected_error = f"Configuration file {config_path} does not exist" 45 | with pytest.raises(FileNotFoundError, match=expected_error): 46 | load_and_check_config(config_path) 47 | 48 | 49 | def test_load_and_check_config_wrong_configuration(tmpdir) -> None: 50 | """Wrong configuration raises""" 51 | config_path = prepare_config_file(tmpdir, "something: useless") 52 | with pytest.raises(KeyError, match="Missing '%indexer_storage' configuration"): 53 | load_and_check_config(config_path) 54 | 55 | 56 | def 
test_load_and_check_config_remote_config_fine(tmpdir) -> None: 57 | """'Remote configuration is fine (when changing the default type)""" 58 | config = {"indexer_storage": {"cls": "remote"}} 59 | config_path = prepare_config_file(tmpdir, config) 60 | cfg = load_and_check_config(config_path) 61 | 62 | assert cfg == config 63 | 64 | 65 | def test_load_and_check_config_local_config_fine(tmpdir) -> None: 66 | """'Complete 'postgresql' configuration is fine""" 67 | config = { 68 | "indexer_storage": { 69 | "cls": "postgresql", 70 | "db": "db", 71 | } 72 | } 73 | config_path = prepare_config_file(tmpdir, config) 74 | cfg = load_and_check_config(config_path) 75 | assert cfg == config 76 | 77 | 78 | def test_load_and_check_config_deprecated(tmpdir) -> None: 79 | """'Complete 'local' configuration is fine""" 80 | config = { 81 | "indexer.storage": { 82 | "cls": "postgresql", 83 | "db": "db", 84 | } 85 | } 86 | config_path = prepare_config_file(tmpdir, config) 87 | with pytest.warns(DeprecationWarning): 88 | cfg = load_and_check_config(config_path) 89 | assert "indexer_storage" in cfg 90 | -------------------------------------------------------------------------------- /swh/indexer/tests/test_fossology_license.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict 7 | import unittest 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | 12 | from swh.indexer import fossology_license 13 | from swh.indexer.fossology_license import FossologyLicenseIndexer, compute_license 14 | from swh.indexer.storage.model import ContentLicenseRow 15 | from swh.indexer.tests.utils import ( 16 | BASE_TEST_CONFIG, 17 | RAW_CONTENT_OBJIDS, 18 | 
SHA1_TO_LICENSES, 19 | CommonContentIndexerTest, 20 | fill_obj_storage, 21 | fill_storage, 22 | filter_dict, 23 | mock_compute_license, 24 | ) 25 | 26 | 27 | class BasicTest(unittest.TestCase): 28 | @patch("swh.indexer.fossology_license.subprocess") 29 | def test_compute_license(self, mock_subprocess): 30 | """Computing licenses from a raw content should return results""" 31 | for path, intermediary_result, output in [ 32 | (b"some/path", None, []), 33 | (b"some/path/2", [], []), 34 | (b"other/path", " contains license(s) GPL,AGPL", ["GPL", "AGPL"]), 35 | ]: 36 | mock_subprocess.check_output.return_value = intermediary_result 37 | 38 | actual_result = compute_license(path) 39 | 40 | self.assertEqual( 41 | actual_result, 42 | { 43 | "licenses": output, 44 | "path": path, 45 | }, 46 | ) 47 | 48 | 49 | CONFIG: Dict[str, Any] = { 50 | **BASE_TEST_CONFIG, 51 | "workdir": "/tmp", 52 | "tools": { 53 | "name": "nomos", 54 | "version": "3.1.0rc2-31-ga2cbb8c", 55 | "configuration": { 56 | "command_line": "nomossa ", 57 | }, 58 | }, 59 | } 60 | 61 | RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) 62 | 63 | 64 | class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): 65 | """Fossology license indexer test scenarios: 66 | 67 | - Known sha1s in the input list have their data indexed 68 | - Unknown sha1 in the input list are not indexed 69 | 70 | """ 71 | 72 | def get_indexer_results(self, ids): 73 | yield from self.idx_storage.content_fossology_license_get(ids) 74 | 75 | def setUp(self): 76 | super().setUp() 77 | # replace actual license computation with a mock 78 | self.orig_compute_license = fossology_license.compute_license 79 | fossology_license.compute_license = mock_compute_license 80 | 81 | self.indexer = FossologyLicenseIndexer(CONFIG) 82 | self.indexer.catch_exceptions = False 83 | self.idx_storage = self.indexer.idx_storage 84 | fill_storage(self.indexer.storage) 85 | fill_obj_storage(self.indexer.objstorage) 86 | 87 | 
self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS 88 | 89 | tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} 90 | 91 | # then 92 | self.expected_results = [ 93 | *[ 94 | ContentLicenseRow(id=self.id0["sha1"], tool=tool, license=license) 95 | for license in SHA1_TO_LICENSES[self.id0["sha1"]] 96 | ], 97 | *[ 98 | ContentLicenseRow(id=self.id1["sha1"], tool=tool, license=license) 99 | for license in SHA1_TO_LICENSES[self.id1["sha1"]] 100 | ], 101 | *[], # self.id2 102 | ] 103 | 104 | def tearDown(self): 105 | super().tearDown() 106 | fossology_license.compute_license = self.orig_compute_license 107 | 108 | 109 | def test_fossology_w_no_tool(): 110 | with pytest.raises(ValueError): 111 | FossologyLicenseIndexer(config=filter_dict(CONFIG, "tools")) 112 | -------------------------------------------------------------------------------- /swh/indexer/tests/test_mimetype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict 7 | import unittest 8 | 9 | import pytest 10 | 11 | from swh.indexer.mimetype import MimetypeIndexer, compute_mimetype_encoding 12 | from swh.indexer.storage.model import ContentMimetypeRow 13 | from swh.indexer.tests.utils import ( 14 | BASE_TEST_CONFIG, 15 | RAW_CONTENT_OBJIDS, 16 | RAW_CONTENTS, 17 | CommonContentIndexerTest, 18 | fill_obj_storage, 19 | fill_storage, 20 | filter_dict, 21 | ) 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "content_id,raw_text,mimetypes,encoding", 26 | RAW_CONTENTS, 27 | ) 28 | def test_compute_mimetype_encoding(content_id, raw_text, mimetypes, encoding): 29 | """Compute mimetype encoding should return results""" 30 | actual_result = compute_mimetype_encoding(raw_text) 
31 | 32 | # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye: 1:5.39-3) 33 | # returns different results. This allows to deal with such a case when executing 34 | # tests on different environments machines (e.g. ci tox, ci debian, dev machine, 35 | # ...) 36 | all_mimetypes = mimetypes if isinstance(mimetypes, tuple) else [mimetypes] 37 | 38 | assert actual_result in [ 39 | {"mimetype": mimetype, "encoding": encoding} for mimetype in all_mimetypes 40 | ] 41 | 42 | 43 | CONFIG: Dict[str, Any] = { 44 | **BASE_TEST_CONFIG, 45 | "tools": { 46 | "name": "file", 47 | "version": "1:5.30-1+deb9u1", 48 | "configuration": {"type": "library", "debian-package": "python3-magic"}, 49 | }, 50 | } 51 | 52 | 53 | class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): 54 | """Mimetype indexer test scenarios: 55 | 56 | - Known sha1s in the input list have their data indexed 57 | - Unknown sha1 in the input list are not indexed 58 | 59 | """ 60 | 61 | def get_indexer_results(self, ids): 62 | yield from self.idx_storage.content_mimetype_get(ids) 63 | 64 | def setUp(self): 65 | self.indexer = MimetypeIndexer(config=CONFIG) 66 | self.indexer.catch_exceptions = False 67 | self.idx_storage = self.indexer.idx_storage 68 | fill_storage(self.indexer.storage) 69 | fill_obj_storage(self.indexer.objstorage) 70 | 71 | self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS 72 | 73 | tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} 74 | 75 | results = [] 76 | for raw_content_id, raw_content, mimetypes, encoding in RAW_CONTENTS: 77 | # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye: 78 | # 1:5.39-3) returns different results. This allows to deal with such a case 79 | # when executing tests on different environments machines (e.g. ci tox, ci 80 | # debian, dev machine, ...) 
81 | all_mimetypes = mimetypes if isinstance(mimetypes, tuple) else [mimetypes] 82 | 83 | results.extend( 84 | [ 85 | ContentMimetypeRow( 86 | id=raw_content_id["sha1"], 87 | tool=tool, 88 | mimetype=mimetype, 89 | encoding=encoding, 90 | ) 91 | for mimetype in all_mimetypes 92 | ] 93 | ) 94 | 95 | self.expected_results = results 96 | 97 | 98 | RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) 99 | 100 | 101 | def test_mimetype_w_no_tool(): 102 | with pytest.raises(ValueError): 103 | MimetypeIndexer(config=filter_dict(CONFIG, "tools")) 104 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 4 3 | envlist = 4 | black 5 | flake8 6 | mypy 7 | py3 8 | 9 | [testenv] 10 | usedevelop = true 11 | extras = 12 | testing 13 | deps = 14 | pytest-cov 15 | commands = 16 | pytest --doctest-modules \ 17 | !slow: --hypothesis-profile=fast \ 18 | slow: --hypothesis-profile=slow \ 19 | --cov=swh/indexer \ 20 | --cov-branch \ 21 | swh/indexer \ 22 | {posargs} 23 | 24 | [testenv:black] 25 | skip_install = true 26 | deps = 27 | black==25.1.0 28 | commands = 29 | {envpython} -m black --check swh 30 | 31 | [testenv:flake8] 32 | skip_install = true 33 | deps = 34 | flake8==7.1.1 35 | flake8-bugbear==24.12.12 36 | flake8-pyproject==1.2.3 37 | pycodestyle==2.12.1 38 | 39 | commands = 40 | {envpython} -m flake8 41 | 42 | [testenv:mypy] 43 | extras = 44 | testing 45 | deps = 46 | mypy==1.15.0 47 | commands = 48 | mypy swh 49 | 50 | # build documentation outside swh-environment using the current 51 | # git HEAD of swh-docs, is executed on CI for each diff to prevent 52 | # breaking doc build 53 | [testenv:sphinx] 54 | allowlist_externals = make 55 | extras = 56 | testing 57 | deps = 58 | # fetch and install swh-docs 59 | git+https://gitlab.softwareheritage.org/swh/devel/swh-docs.git\#egg=swh.docs 60 | setenv = 61 | 
SWH_PACKAGE_DOC_TOX_BUILD = 1 62 | # turn warnings into errors 63 | SPHINXOPTS = -W 64 | commands = 65 | make -I {env_dir}/share/swh-docs -C docs 66 | --------------------------------------------------------------------------------