├── .copier-answers.yml ├── .git-blame-ignore-revs ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTORS ├── LICENSE ├── Makefile ├── Makefile.local ├── README.rst ├── codemeta.json ├── conftest.py ├── docs ├── .gitignore ├── Makefile ├── Makefile.local ├── _static │ └── .placeholder ├── _templates │ └── .placeholder ├── cli.rst ├── conf.py ├── images │ ├── .gitignore │ ├── Makefile │ ├── metadata-flow.dot │ ├── tasks-extrinsic-metadata-indexers.uml │ └── tasks-intrinsic-metadata-indexers.uml ├── index.rst ├── metadata-workflow.rst └── swhpkg.rst ├── pyproject.toml ├── requirements-swh.txt ├── requirements-test.txt ├── requirements.txt ├── sql ├── bin │ ├── db-upgrade │ └── dot_add_content ├── doc │ ├── json │ └── sql └── json │ ├── .gitignore │ ├── Makefile │ ├── indexer_configuration.tool_configuration.schema.json │ └── revision_metadata.translated_metadata.json ├── swh └── indexer │ ├── __init__.py │ ├── bibtex.py │ ├── cli.py │ ├── codemeta.py │ ├── data │ ├── Gitea.csv │ ├── codemeta │ │ ├── CITATION │ │ ├── LICENSE │ │ ├── codemeta-2.0.jsonld │ │ ├── codemeta-3.0.jsonld │ │ └── crosswalk.csv │ ├── composer.csv │ ├── nuget.csv │ ├── pubspec.csv │ └── schema.org │ │ ├── CITATION │ │ ├── LICENSE │ │ └── schemaorgcontext.jsonld │ ├── fossology_license.py │ ├── indexer.py │ ├── metadata.py │ ├── metadata_detector.py │ ├── metadata_dictionary │ ├── __init__.py │ ├── base.py │ ├── cff.py │ ├── codemeta.py │ ├── composer.py │ ├── dart.py │ ├── gitea.py │ ├── github.py │ ├── maven.py │ ├── npm.py │ ├── nuget.py │ ├── python.py │ ├── ruby.py │ └── utils.py │ ├── mimetype.py │ ├── namespaces.py │ ├── origin_head.py │ ├── py.typed │ ├── rehash.py │ ├── storage │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── client.py │ │ ├── serializers.py │ │ └── server.py │ ├── converters.py │ ├── db.py │ ├── exc.py │ ├── in_memory.py │ ├── interface.py │ ├── metrics.py │ ├── model.py │ ├── sql │ │ ├── 10-superuser-init.sql │ │ ├── 
20-enums.sql │ │ ├── 30-schema.sql │ │ ├── 50-data.sql │ │ ├── 50-func.sql │ │ ├── 60-indexes.sql │ │ └── upgrades │ │ │ ├── 115.sql │ │ │ ├── 116.sql │ │ │ ├── 117.sql │ │ │ ├── 118.sql │ │ │ ├── 119.sql │ │ │ ├── 120.sql │ │ │ ├── 121.sql │ │ │ ├── 122.sql │ │ │ ├── 123.sql │ │ │ ├── 124.sql │ │ │ ├── 125.sql │ │ │ ├── 126.sql │ │ │ ├── 127.sql │ │ │ ├── 128.sql │ │ │ ├── 129.sql │ │ │ ├── 130.sql │ │ │ ├── 131.sql │ │ │ ├── 132.sql │ │ │ ├── 133.sql │ │ │ ├── 134.sql │ │ │ ├── 135.sql │ │ │ ├── 136.sql │ │ │ └── 137.sql │ └── writer.py │ └── tests │ ├── __init__.py │ ├── conftest.py │ ├── metadata_dictionary │ ├── __init__.py │ ├── test_cff.py │ ├── test_codemeta.py │ ├── test_composer.py │ ├── test_dart.py │ ├── test_gitea.py │ ├── test_github.py │ ├── test_maven.py │ ├── test_npm.py │ ├── test_nuget.py │ ├── test_python.py │ └── test_ruby.py │ ├── storage │ ├── __init__.py │ ├── conftest.py │ ├── generate_data_test.py │ ├── test_api_client.py │ ├── test_converters.py │ ├── test_in_memory.py │ ├── test_metrics.py │ ├── test_model.py │ ├── test_server.py │ └── test_storage.py │ ├── test_bibtex.py │ ├── test_cli.py │ ├── test_codemeta.py │ ├── test_fossology_license.py │ ├── test_indexer.py │ ├── test_metadata.py │ ├── test_mimetype.py │ ├── test_origin_head.py │ ├── test_origin_metadata.py │ └── utils.py └── tox.ini /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: v0.3.3 3 | _src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git 4 | description: Software Heritage indexer 5 | distribution_name: swh-indexer 6 | have_cli: true 7 | have_workers: true 8 | package_root: swh/indexer 9 | project_name: swh.indexer 10 | python_minimal_version: '3.7' 11 | readme_format: rst 12 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: 
-------------------------------------------------------------------------------- 1 | # python: Reformat code with black 2 | 5aa97ccd6ce29d6f66eb093c5d06e9030d7449fd 3 | 0f847f6119195649fe4108b776b9244940ebdb46 4 | 2e9f1d3e896062ae6b3cd99dc1a5d4148beebbf7 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | *.pyc 3 | .coverage 4 | .eggs/ 5 | .hypothesis 6 | .mypy_cache 7 | .tox 8 | __pycache__ 9 | build/ 10 | dist/ 11 | # these are symlinks created by a hook in swh-docs' main sphinx conf.py 12 | docs/README.rst 13 | docs/README.md 14 | # this should be a symlink for people who want to build the sphinx doc 15 | # without using tox, generally created by the swh-env/bin/update script 16 | docs/Makefile.sphinx 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-json 7 | - id: check-yaml 8 | 9 | - repo: https://github.com/python/black 10 | rev: 25.1.0 11 | hooks: 12 | - id: black 13 | 14 | - repo: https://github.com/PyCQA/isort 15 | rev: 6.0.0 16 | hooks: 17 | - id: isort 18 | 19 | - repo: https://github.com/pycqa/flake8 20 | rev: 7.1.1 21 | hooks: 22 | - id: flake8 23 | additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject] 24 | 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.4.1 27 | hooks: 28 | - id: codespell 29 | name: Check source code spelling 30 | args: [-L assertIn] 31 | exclude: ^(swh/indexer/data/) 32 | stages: [pre-commit] 33 | - id: codespell 34 | name: Check commit message spelling 35 | stages: [commit-msg] 36 | 37 | - repo: local 38 | hooks: 39 | - id: mypy 40 | name: mypy 41 | entry: mypy 42 | 
args: [swh] 43 | pass_filenames: false 44 | language: system 45 | types: [python] 46 | - id: twine-check 47 | name: twine check 48 | description: call twine check when pushing an annotated release tag 49 | entry: bash -c "ref=$(git describe) && 50 | [[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] && 51 | (python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true" 52 | pass_filenames: false 53 | stages: [pre-push] 54 | language: python 55 | additional_dependencies: [twine, build] 56 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Copyright (C) 2015-2017 The Software Heritage developers 2 | 3 | See http://www.softwareheritage.org/ for more information. 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Software Heritage Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as Software 6 | Heritage contributors and maintainers pledge to making participation in our 7 | project and our community a harassment-free experience for everyone, regardless 8 | of age, body size, disability, ethnicity, sex characteristics, gender identity 9 | and expression, level of experience, education, socioeconomic status, 10 | nationality, personal appearance, race, religion, or sexual identity and 11 | orientation. 
12 | 13 | ## Our Standards 14 | 15 | Examples of behavior that contributes to creating a positive environment 16 | include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or 27 | advances 28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our Responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies within all project spaces, and it also applies when 50 | an individual is representing the project or its community in public spaces. 51 | Examples of representing a project or community include using an official 52 | project e-mail address, posting via an official social media account, or acting 53 | as an appointed representative at an online or offline event. 
Representation of 54 | a project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting the project team at `conduct@softwareheritage.org`. All 60 | complaints will be reviewed and investigated and will result in a response that 61 | is deemed necessary and appropriate to the circumstances. The project team is 62 | obligated to maintain confidentiality with regard to the reporter of an 63 | incident. Further details of specific enforcement policies may be posted 64 | separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 69 | 70 | ## Attribution 71 | 72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 73 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 74 | 75 | [homepage]: https://www.contributor-covenant.org 76 | 77 | For answers to common questions about this code of conduct, see 78 | https://www.contributor-covenant.org/faq 79 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Kumar Shivendu 2 | Siddharth Ravikumar 3 | Thibault Allançon 4 | Satvik Vemuganti 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile driver for SWH Python modules. DO NOT CHANGE. 
2 | # You can add custom Makefile rules to Makefile.local 3 | 4 | include ../Makefile.python 5 | -include Makefile.local 6 | -------------------------------------------------------------------------------- /Makefile.local: -------------------------------------------------------------------------------- 1 | TESTFLAGS += --hypothesis-profile=fast 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Software Heritage - Indexer 2 | =========================== 3 | 4 | Tools to compute multiple indexes on SWH's raw contents: 5 | 6 | - content: 7 | 8 | - mimetype 9 | - fossology-license 10 | - metadata 11 | 12 | - origin: 13 | 14 | - metadata (intrinsic, using the content indexer; and extrinsic) 15 | 16 | An indexer is in charge of: 17 | 18 | - looking up objects 19 | - extracting information from those objects 20 | - storing that information in the swh-indexer db 21 | 22 | There are multiple indexers working on different object types: 23 | 24 | - content indexer: works with content sha1 hashes 25 | - revision indexer: works with revision sha1 hashes 26 | - origin indexer: works with origin identifiers 27 | 28 | Indexation procedure: 29 | 30 | - receive batch of ids 31 | - retrieve the associated data depending on object type 32 | - compute for that object some index 33 | - store the result to swh's storage 34 | 35 | Current content indexers: 36 | 37 | - mimetype (queue swh_indexer_content_mimetype): detect the encoding 38 | and mimetype 39 | 40 | - fossology-license (queue swh_indexer_fossology_license): compute the 41 | license 42 | 43 | - metadata: translate files from ecosystem-specific formats to JSON-LD 44 | (using schema.org/CodeMeta vocabulary) 45 | 46 | Current origin indexers: 47 | 48 | - metadata: translate files from ecosystem-specific formats to JSON-LD 49 | (using schema.org/CodeMeta and ForgeFed vocabularies) 50 | 
-------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld", 3 | "@type": "SoftwareSourceCode", 4 | "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d", 5 | "description": "Software Heritage Indexer for revisions and contents", 6 | "name": "swh-indexer", 7 | "isPartOf": { 8 | "@type": "SoftwareSourceCode", 9 | "name": "swh-environment", 10 | "identifier": "83e766feafde91242883be1bf369ed3e6865824f" 11 | }, 12 | "codeRepository": "https://forge.softwareheritage.org/diffusion/78/", 13 | "issueTracker": "https://forge.softwareheritage.org/maniphest/", 14 | "license": "https://spdx.org/licenses/GPL-3.0.html", 15 | "version": "0.0.35", 16 | "author": [ 17 | { 18 | "@type": "Organization", 19 | "name": "Software Heritage", 20 | "url": "https://www.softwareheritage.org", 21 | "email": "swh-devel@inria.fr" 22 | } 23 | ], 24 | "developmentStatus": "active", 25 | "keywords": [ 26 | "indexer", 27 | "software", 28 | "mimetype", 29 | "ctags", 30 | "language", 31 | "fossology-license", 32 | "metadata", 33 | "metadata-detector", 34 | "metadata-translator" 35 | ], 36 | "dateCreated":"2017-06-12", 37 | "datePublished":"2017-06-12", 38 | "programmingLanguage": "Python" 39 | } 40 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2025 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from hypothesis import settings 7 | 8 | # define tests profile. 
Full documentation is at: 9 | # https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles 10 | settings.register_profile("fast", max_examples=5, deadline=5000) 11 | settings.register_profile("slow", max_examples=20, deadline=5000) 12 | 13 | # Ignore the following modules because wsgi module fails as no 14 | # configuration file is found (--doctest-modules forces the module 15 | # loading) 16 | collect_ignore = ["swh/indexer/storage/api/wsgi.py"] 17 | 18 | # we use the various swh fixtures 19 | pytest_plugins = [ 20 | "swh.journal.pytest_plugin", 21 | "swh.storage.pytest_plugin", 22 | ] 23 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | apidoc/ 3 | *-stamp 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | -include Makefile.local 2 | include Makefile.sphinx 3 | -------------------------------------------------------------------------------- /docs/Makefile.local: -------------------------------------------------------------------------------- 1 | sphinx/html: images 2 | sphinx/clean: clean-images 3 | assets: images 4 | 5 | images: 6 | make -C images/ 7 | clean-images: 8 | make -C images/ clean 9 | 10 | .PHONY: images clean-images 11 | 12 | -------------------------------------------------------------------------------- /docs/_static/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/docs/_static/.placeholder -------------------------------------------------------------------------------- /docs/_templates/.placeholder: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/docs/_templates/.placeholder -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | .. _swh-indexer-cli: 2 | 3 | Command-line interface 4 | ====================== 5 | 6 | .. click:: swh.indexer.cli:indexer_cli_group 7 | :prog: swh indexer 8 | :nested: full 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | from swh.docs.sphinx.conf import * # NoQA 2 | -------------------------------------------------------------------------------- /docs/images/.gitignore: -------------------------------------------------------------------------------- 1 | *.svg 2 | -------------------------------------------------------------------------------- /docs/images/Makefile: -------------------------------------------------------------------------------- 1 | 2 | UML_DIAGS_SRC = $(wildcard *.uml) 3 | UML_DIAGS = $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC)) 4 | 5 | DOT_DIAGS_SRC = $(wildcard *.dot) 6 | DOT_DIAGS = $(patsubst %.dot,%.svg,$(DOT_DIAGS_SRC)) 7 | 8 | all: $(UML_DIAGS) $(DOT_DIAGS) 9 | 10 | %.svg: %.uml 11 | DISPLAY="" plantuml -tsvg $< 12 | 13 | %.svg: %.dot 14 | dot $< -T svg -o $@ 15 | 16 | clean: 17 | -rm -f $(DEP_GRAPHS) $(UML_DIAGS) $(DOT_DIAGS) 18 | -------------------------------------------------------------------------------- /docs/images/metadata-flow.dot: -------------------------------------------------------------------------------- 1 | digraph metadata_flow { 2 | subgraph cluster_forges { 3 | style=invis; 4 | origin_vcs [label="Version Control Systems\n(Git, SVN, ...)"]; 5 | origin_pm [label="Package Managers\n(NPM, PyPI, Debian, ...)"]; 6 | } 7 | subgraph internet { 8 | rank=same; 9 | deposit_client 
[label="Deposit Clients\n(HAL, IPOL, eLife, Intel, ...)"]; 10 | registries [label="Registries\n(Wikidata, ...)"]; 11 | } 12 | 13 | subgraph cluster_SWH { 14 | label="Software Heritage"; 15 | labeljust="r"; 16 | labelloc="b"; 17 | loader_vcs [label="VCS loader", shape="box"]; 18 | loader_pm [label="PM loader", shape="box"]; 19 | deposit_server [label="Deposit server", shape="box"]; 20 | indexer_extr [label="extrinsic metadata indexer\n(translate to Codemeta)", shape="box"]; 21 | indexer_intr [label="intrinsic metadata indexer\n(translate to Codemeta)", shape="box"]; 22 | registry_fetcher[label="?", style="dashed", shape="box"]; 23 | 24 | storage [label="\nMain Storage\n(swh-storage and\nswh-objstorage)", shape=cylinder]; 25 | remd_storage [label="\nRaw Extrinsic\nMetadata Storage", shape=cylinder]; 26 | indexed_storage [label="\nIndexed\nMetadata Storage\n(search, idx-storage)", shape=cylinder]; 27 | 28 | webapp [label="Web Interface", shape="box"]; 29 | } 30 | 31 | subgraph users { 32 | browser [label="Web Browser", shape="box"] 33 | } 34 | 35 | origin_vcs -> loader_vcs [label="pull"]; 36 | loader_vcs -> storage; 37 | origin_pm -> loader_pm [label="pull"] 38 | loader_pm -> {storage, remd_storage}; 39 | deposit_client -> deposit_server [label="push\n(SWORD + Codemeta)"]; 40 | deposit_server -> {storage, remd_storage}; 41 | 42 | registries -> registry_fetcher -> remd_storage [style="dashed"]; 43 | 44 | storage -> indexer_intr [label="all kinds of\nmetadata formats"]; 45 | indexer_intr -> indexed_storage [label="only Codemeta"]; 46 | remd_storage -> indexer_extr [label="all kinds of\nmetadata formats"]; 47 | indexer_extr-> indexed_storage; 48 | 49 | {storage, remd_storage, indexed_storage} -> webapp; 50 | webapp -> browser [label="search, display,\nBibTeX export,\ndownload, ..."]; 51 | } 52 | -------------------------------------------------------------------------------- /docs/images/tasks-extrinsic-metadata-indexers.uml: 
-------------------------------------------------------------------------------- 1 | @startuml 2 | participant LOADERS as "Metadata Loaders" 3 | participant STORAGE as "Graph Storage" 4 | participant JOURNAL as "Journal" 5 | participant IDX_REM_META as "REM Indexer" 6 | participant IDX_STORAGE as "Indexer Storage" 7 | 8 | activate IDX_STORAGE 9 | activate STORAGE 10 | activate JOURNAL 11 | activate LOADERS 12 | 13 | LOADERS->>STORAGE: new REM (Raw Extrinsic Metadata) object\n for Origin http://example.org/repo.git\nor object swh:1:dir:... 14 | STORAGE->>JOURNAL: new REM object 15 | deactivate LOADERS 16 | 17 | JOURNAL->>IDX_REM_META: run indexers on REM object 18 | activate IDX_REM_META 19 | 20 | IDX_REM_META->>IDX_REM_META: recognize REM object (gitea/github/deposit/...) 21 | 22 | IDX_REM_META->>IDX_REM_META: parse REM object 23 | 24 | alt If the REM object describe an origin 25 | IDX_REM_META->>IDX_STORAGE: origin_extrinsic_metadata_add(id="http://example.org/repo.git", {author: "Jane Doe", ...}) 26 | IDX_STORAGE->>IDX_REM_META: ok 27 | end 28 | 29 | alt If the REM object describe a directory 30 | IDX_REM_META->>IDX_STORAGE: directory_extrinsic_metadata_add(id="swh:1:dir:...", {author: "Jane Doe", ...}) 31 | IDX_STORAGE->>IDX_REM_META: ok 32 | end 33 | 34 | deactivate IDX_REM_META 35 | 36 | 37 | @enduml 38 | -------------------------------------------------------------------------------- /docs/images/tasks-intrinsic-metadata-indexers.uml: -------------------------------------------------------------------------------- 1 | @startuml 2 | participant LOADERS as "Loaders" 3 | participant STORAGE as "Graph Storage" 4 | participant JOURNAL as "Journal" 5 | participant IDX_ORIG_META as "Origin Metadata Indexer" 6 | participant IDX_ORIG_HEAD as "Origin-Head Indexer" 7 | participant IDX_DIR_META as "Directory Metadata Indexer" 8 | participant IDX_CONT_META as "Content Metadata Indexer" 9 | participant IDX_STORAGE as "Indexer Storage" 10 | participant OBJ_STORAGE as 
"Object Storage" 11 | 12 | activate OBJ_STORAGE 13 | activate IDX_STORAGE 14 | activate STORAGE 15 | activate JOURNAL 16 | activate IDX_ORIG_META 17 | 18 | activate LOADERS 19 | 20 | LOADERS->>STORAGE: Repository content 21 | LOADERS->>STORAGE: Origin http://example.org/repo.git\nwas added/revisited 22 | STORAGE->>JOURNAL: Origin http://example.org/repo.git\nwas added/revisited 23 | deactivate LOADERS 24 | 25 | JOURNAL->>IDX_ORIG_META: run indexers on origin\nhttp://example.org/repo.git 26 | 27 | IDX_ORIG_META->>IDX_ORIG_HEAD: Find HEAD revision of\nhttp://example.org/repo.git 28 | activate IDX_ORIG_HEAD 29 | 30 | IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin="http://example.org/repo.git") 31 | 32 | STORAGE->>IDX_ORIG_HEAD: branches 33 | 34 | IDX_ORIG_HEAD->>IDX_ORIG_META: run Revision Metadata Indexer\non revision 42abcdef (head of origin\nhttp://example.org/repo.git) 35 | deactivate IDX_ORIG_HEAD 36 | 37 | IDX_ORIG_META->>STORAGE: revision_get(sha1=42abcdef) 38 | STORAGE->>IDX_ORIG_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...} 39 | 40 | IDX_ORIG_META->>IDX_DIR_META: Index directory 456789ab\n(head of origin http://example.org/repo.git) 41 | activate IDX_DIR_META 42 | 43 | IDX_DIR_META->>STORAGE: directory_ls(sha1=456789ab) 44 | STORAGE->>IDX_DIR_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...] 45 | 46 | IDX_DIR_META->>IDX_DIR_META: package.json is a metadata file 47 | 48 | IDX_DIR_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe) 49 | IDX_STORAGE->>IDX_DIR_META: none / {author: "Jane Doe", ...} 50 | 51 | alt If the storage answered "none" 52 | IDX_DIR_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file 53 | activate IDX_CONT_META 54 | 55 | IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe 56 | 57 | OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...' 
58 | 59 | IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author 60 | 61 | IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...}) 62 | IDX_STORAGE->>IDX_CONT_META: ok 63 | 64 | IDX_CONT_META->>IDX_DIR_META: extracted: {author: "Jane Doe", ...} 65 | deactivate IDX_CONT_META 66 | 67 | IDX_DIR_META->>IDX_STORAGE: directory_metadata_add(sha1=456789ab, {author: "Jane Doe", ...}) 68 | IDX_STORAGE->>IDX_DIR_META: ok 69 | end 70 | 71 | IDX_DIR_META->>IDX_ORIG_META: extracted: {author: "Jane Doe", ...} 72 | deactivate IDX_DIR_META 73 | 74 | IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id="http://example.org/repo.git", {author: "Jane Doe", ...}, from_directory=456789ab) 75 | IDX_STORAGE->>IDX_ORIG_META: ok 76 | deactivate IDX_ORIG_META 77 | 78 | 79 | @enduml 80 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _swh-indexer: 2 | 3 | .. include:: README.rst 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | :caption: Contents: 8 | 9 | README.md 10 | metadata-workflow.rst 11 | swhpkg.rst 12 | 13 | 14 | Reference Documentation 15 | ----------------------- 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | cli 21 | 22 | .. only:: standalone_package_doc 23 | 24 | Indices and tables 25 | ------------------ 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /docs/swhpkg.rst: -------------------------------------------------------------------------------- 1 | SwhPkg Vocabulary 2 | ================================ 3 | 4 | .. note:: This is an early draft and hasn't been implemented yet 5 | 6 | 7 | SwhPkg is a vocabulary that complements ontologies like schema.org and CodeMeta 8 | in describing software projects. 
While the latter are meant to describe 9 | source code projects, SwhPkg describes relationships between different packages released 10 | by such projects. 11 | 12 | The namespace is ``https://www.softwareheritage.org/schema/2023/packages/``; 13 | and it is meant to be used primarily alongside CodeMeta/schema.org 14 | and ForgeFed/ActivityStreams. 15 | 16 | 17 | The following prefixes are used throughout this document for readability: 18 | 19 | .. code-block:: json 20 | 21 | { 22 | "schema": "http://schema.org/", 23 | "codemeta": "https://codemeta.github.io/terms/", 24 | "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", 25 | "swhpackages": "https://archive.softwareheritage.org/packages/", 26 | } 27 | 28 | For example, here is a document using all three together: 29 | 30 | .. code-block:: json 31 | 32 | { 33 | "@context": { 34 | "schema": "http://schema.org/", 35 | "codemeta": "https://codemeta.github.io/terms/", 36 | "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", 37 | "swhpackages": "https://archive.softwareheritage.org/packages/", 38 | "package": {"@id": "swhpkg:package", "@type": "@id"}, 39 | "release": {"@id": "swhpkg:release", "@type": "@id"}, 40 | "dependencies": {"@id": "swhpkg:dependencies"}, 41 | "dependency": {"@id": "swhpkg:dependency", "@type": "@id"}, 42 | "dependent": {"@id": "swhpkg:dependent", "@type": "@id"}, 43 | "kind": {"@id": "swhpkg:kind"}, 44 | "optional": {"@id": "swhpkg:optional"} 45 | }, 46 | "@type": "schema:SoftwareSourceCode", 47 | "@id": "https://npmjs.com/package/d3@7.8.2", 48 | "package": "swhpackages:js/d3", 49 | "release": "swhpackages:js/d3@7.8.2", 50 | "schema:name": "d3", 51 | "schema:version": "7.8.2", 52 | "schema:description": "Data-Driven Documents", 53 | "dependencies": [ 54 | { 55 | "@type": "swhpkg:dependencies", 56 | "@id": "swhpackages:js/d3@7.8.2#d3-array", 57 | "dependent": "swhpackages:js/d3@7.8.2", 58 | "dependency": "swhpackages:js/d3-array", 59 | "constraint": "^3.0.0", 60 | 
"kind": "runtime", 61 | "optional": false 62 | }, 63 | { 64 | "@type": "swhpkg:dependencies", 65 | "@id": "swhpackages:js/d3@7.8.2#mocha", 66 | "dependent": "swhpackages:js/d3@7.8.2", 67 | "dependency": "swhpackages:js/mocha", 68 | "constraint": ">10.0.0", 69 | "kind": "development", 70 | "optional": true 71 | } 72 | ] 73 | } 74 | 75 | SwhPkg Terms 76 | ------------- 77 | 78 | .. list-table:: 79 | :header-rows: 1 80 | 81 | * - Property 82 | - Type 83 | - Examples 84 | - Description 85 | * - ``package`` 86 | - ``swhpkg:package`` 87 | - ``swhpackages:js/d3``, ``swhpackages:python/numpy`` 88 | - Package that is released by the SoftwareSourceCode/SoftwareApplication. 89 | * - ``release`` 90 | - ``swhpkg:release`` 91 | - ``swhpackages:js/d3@7.8.2``, ``swhpackages:python/numpy@1.24.2`` 92 | - Specific version of the package that is released by the SoftwareSourceCode/SoftwareApplication 93 | * - ``dependencies`` 94 | - ``swhpkg:dependencies`` 95 | - d3 depends on d3-array and mocha. 96 | - Dependencies of the project. There can be many of them. 97 | * - ``dependent`` 98 | - ``swhpkg:release`` 99 | - ``swhpkg:js/d3`` 100 | - A reference to the package release that depends on the dependency. 101 | * - ``dependency`` 102 | - ``swhpkg:package`` 103 | - ``swhpackages:js/d3``, ``swhpackages:python/django`` 104 | - A reference to the package that is depended on. 105 | * - ``constraint`` 106 | - Text 107 | - ``^3.0.0``, ``>10.0.0`` 108 | - The constraint on a dependency relation. It can be a version range, or a git commit hash, or even a file path. 109 | * - ``kind`` 110 | - Text 111 | - ``runtime``, ``development`` 112 | - The type of dependency relation. Some common values are ``runtime``, ``development``. 113 | * - ``optional`` 114 | - boolean 115 | - ``true``, ``false`` 116 | - Whether the dependency is optional or not. 
117 | 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "swh.indexer" 3 | authors = [ 4 | {name="Software Heritage developers", email="swh-devel@inria.fr"}, 5 | ] 6 | 7 | description = "Software Heritage indexer" 8 | readme = {file = "README.rst", content-type = "text/x-rst"} 9 | requires-python = ">=3.9" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "Intended Audience :: Developers", 13 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 14 | "Operating System :: OS Independent", 15 | "Development Status :: 5 - Production/Stable", 16 | ] 17 | dynamic = ["version", "dependencies", "optional-dependencies"] 18 | 19 | [tool.setuptools.packages.find] 20 | include = ["swh.*"] 21 | 22 | [tool.setuptools.dynamic] 23 | dependencies = {file = ["requirements.txt", "requirements-swh.txt"]} 24 | 25 | [tool.setuptools.dynamic.optional-dependencies] 26 | testing = {file = ["requirements-test.txt"]} 27 | 28 | [project.entry-points."swh.cli.subcommands"] 29 | "swh.indexer" = "swh.indexer.cli" 30 | 31 | [project.entry-points."swh.indexer_storage.classes"] 32 | "postgresql" = "swh.indexer.storage:IndexerStorage" 33 | "remote" = "swh.indexer.storage.api.client:RemoteStorage" 34 | "memory" = "swh.indexer.storage.in_memory:IndexerStorage" 35 | 36 | [project.urls] 37 | "Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer" 38 | "Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer/-/issues" 39 | "Funding" = "https://www.softwareheritage.org/donate" 40 | "Documentation" = "https://docs.softwareheritage.org/devel/swh-indexer/" 41 | "Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-indexer.git" 42 | 43 | [build-system] 44 | requires = ["setuptools", "setuptools-scm"] 45 | build-backend = "setuptools.build_meta" 46 | 47 | 
[tool.setuptools_scm] 48 | fallback_version = "0.0.1" 49 | 50 | [tool.black] 51 | target-version = ['py39', 'py310', 'py311', 'py312'] 52 | 53 | [tool.isort] 54 | multi_line_output = 3 55 | include_trailing_comma = true 56 | force_grid_wrap = 0 57 | use_parentheses = true 58 | ensure_newline_before_comments = true 59 | line_length = 88 60 | force_sort_within_sections = true 61 | known_first_party = ['swh'] 62 | 63 | [tool.mypy] 64 | namespace_packages = true 65 | warn_unused_ignores = true 66 | explicit_package_bases = true 67 | # ^ Needed for mypy to detect py.typed from swh packages installed 68 | # in editable mode 69 | 70 | plugins = [] 71 | 72 | # 3rd party libraries without stubs (yet) 73 | [[tool.mypy.overrides]] 74 | module = [ 75 | "pybtex.*", 76 | "pyld.*", 77 | ] 78 | ignore_missing_imports = true 79 | 80 | [tool.flake8] 81 | select = ["C", "E", "F", "W", "B950"] 82 | ignore = [ 83 | "E203", # whitespaces before ':' 84 | "E231", # missing whitespace after ',' 85 | "E501", # line too long, use B950 warning from flake8-bugbear instead 86 | "W503" # line break before binary operator 87 | ] 88 | max-line-length = 88 89 | 90 | [tool.pytest.ini_options] 91 | norecursedirs = "build docs .*" 92 | asyncio_mode = "strict" 93 | consider_namespace_packages = true 94 | -------------------------------------------------------------------------------- /requirements-swh.txt: -------------------------------------------------------------------------------- 1 | swh.core[db,http] >= 4.0.0 2 | swh.model >= 6.13.0 3 | swh.objstorage >= 2.3.1 4 | swh.storage >= 3.0.0 5 | swh.journal >= 0.1.0 6 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka 2 | hypothesis >= 3.11.0 3 | pytest >= 8.1 4 | pytest-mock 5 | swh.core[testing] >= 3.0.0 6 | swh.journal[pytest] >= 2.0.0 7 | swh.storage[pytest] >= 3.1.0 8 | 9 | types-click 10 | 
types-confluent-kafka 11 | types-pyyaml 12 | types-xmltodict 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-magic >= 0.4.13 2 | click 3 | # frozendict: dependency of pyld 4 | # the version 2.1.2 is causing segmentation faults 5 | # cf https://forge.softwareheritage.org/T3815 6 | frozendict != 2.1.2 7 | iso8601 8 | # use upstream pybtex that removed pkg_resources use until a new release 9 | pybtex @ git+https://bitbucket.org/pybtex-devs/pybtex.git@9b97822 10 | pyld 11 | rdflib >= 7.1.4 # first version with this patch: https://github.com/RDFLib/rdflib/pull/3011 12 | sentry-sdk 13 | typing-extensions 14 | xmltodict 15 | -------------------------------------------------------------------------------- /sql/bin/db-upgrade: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute a draft upgrade script for the DB schema, based on Git revisions. 4 | 5 | # Depends: apgdiff 6 | 7 | set -e 8 | 9 | SQLS="swh-*.sql" 10 | VERSION_SQL="swh-schema.sql" 11 | UPGRADE_DIR="upgrades" 12 | DB_NAME="softwareheritage-dev" 13 | 14 | usage () { 15 | echo "Usage: db-upgrade GIT_REV_FROM [GIT_REV_TO]" 16 | echo "Example: db-upgrade HEAD^" 17 | echo " db-upgrade HEAD~4 HEAD~2" 18 | echo "See also: gitrevisions(7)" 19 | exit 1 20 | } 21 | 22 | pg_dump_revision () { 23 | rev="$1" 24 | dump="$2" 25 | 26 | echo "checking out revision $rev, and dumping DB at the time..." 
27 | if [ "$rev" != "HEAD" ] ; then 28 | git checkout --quiet "$rev" 29 | fi 30 | make distclean filldb > /dev/null 31 | pg_dump "$DB_NAME" > "$dump" 32 | if [ "$rev" != "HEAD" ] ; then 33 | git checkout --quiet - 34 | fi 35 | } 36 | 37 | # argument parsing 38 | if [ -z "$1" ] ; then 39 | usage 40 | fi 41 | from_rev="$1" 42 | shift 1 43 | if [ -z "$1" ] ; then 44 | to_rev="HEAD" 45 | else 46 | to_rev="$1" 47 | shift 1 48 | fi 49 | 50 | old_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) 51 | new_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) 52 | trap "rm -f $old_dump $new_dump" EXIT 53 | 54 | schema_version=$(grep -i -A 1 '^insert into dbversion' "$VERSION_SQL" | tail -n 1 \ 55 | | sed -e 's/.*values(//i' -e 's/,.*//') 56 | upgrade_script=$(mktemp -p "$UPGRADE_DIR" $(printf '%.03d' ${schema_version}).XXXX.sql) 57 | pg_dump_revision "$from_rev" "$old_dump" 58 | pg_dump_revision "$to_rev" "$new_dump" 59 | 60 | cat > "$upgrade_script" <> "$upgrade_script" 71 | 72 | echo "all done." 73 | echo "Draft upgrade script is at: ${upgrade_script}" 74 | -------------------------------------------------------------------------------- /sql/bin/dot_add_content: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOT_FILE="$1" 4 | DOT_EXTRA="$2" 5 | if [ -z "$DOT_FILE" -o -z "$DOT_EXTRA" ] ; then 6 | echo "Usage: $0 DOT_FILE DOT_EXTRA" 7 | exit 1 8 | fi 9 | 10 | schema_version=$(grep -i -A 1 '^insert into dbversion' swh-schema.sql | tail -n 1 \ 11 | | sed -e 's/.*values(//i' -e 's/,.*//') 12 | 13 | head -n -1 "$DOT_FILE" # all of $DOT_FILE but last line 14 | sed "s/@@VERSION@@/$schema_version/" "$DOT_EXTRA" 15 | echo "}" 16 | -------------------------------------------------------------------------------- /sql/doc/json: -------------------------------------------------------------------------------- 1 | ../json -------------------------------------------------------------------------------- /sql/doc/sql: 
-------------------------------------------------------------------------------- 1 | ../autodoc -------------------------------------------------------------------------------- /sql/json/.gitignore: -------------------------------------------------------------------------------- 1 | *-stamp 2 | -------------------------------------------------------------------------------- /sql/json/Makefile: -------------------------------------------------------------------------------- 1 | # Depends: json-glib-tools 2 | 3 | JSONVAL = json-glib-validate 4 | JSONS = $(wildcard *.json) 5 | 6 | all: validate 7 | check: validate 8 | test: validate 9 | 10 | validate: validate-stamp 11 | validate-stamp: $(JSONS) 12 | make $(patsubst %,validate/%,$?) 13 | touch $@ 14 | 15 | validate/%: 16 | $(JSONVAL) $* 17 | 18 | clean: 19 | rm -f validate-stamp 20 | -------------------------------------------------------------------------------- /sql/json/indexer_configuration.tool_configuration.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "id": "http://softwareheritage.org/schemas/indexer_configuration.tool_configuration.schema.json", 4 | 5 | "type": "object", 6 | "properties": { 7 | "command_line": { 8 | "type": "string" 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /sql/json/revision_metadata.translated_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json", 4 | 5 | "type": "object", 6 | "properties": { 7 | "developmentStatus": { 8 | "type": "list" 9 | }, 10 | "version": { 11 | "type": "list" 12 | }, 13 | "operatingSystem": { 14 | "type": "list" 15 | }, 16 | "description": { 17 | "type": "list" 18 | }, 19 | "keywords": { 20 | "type": "list" 
21 | }, 22 | "issueTracker": { 23 | "type": "list" 24 | }, 25 | "name": { 26 | "type": "list" 27 | }, 28 | "author": { 29 | "type": "list" 30 | }, 31 | "relatedLink": { 32 | "type": "list" 33 | }, 34 | "url": { 35 | "type": "list" 36 | }, 37 | "license": { 38 | "type": "list" 39 | }, 40 | "maintainer": { 41 | "type": "list" 42 | }, 43 | "email": { 44 | "type": "list" 45 | }, 46 | "softwareRequirements": { 47 | "type": "list" 48 | }, 49 | "identifier": { 50 | "type": "list" 51 | }, 52 | "codeRepository": { 53 | "type": "list" 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /swh/indexer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | # implemented as a function to help lazy loading 8 | def get_datastore(*args, **kw): 9 | from .indexer import get_indexer_storage 10 | 11 | return get_indexer_storage(*args, **kw) 12 | 13 | 14 | default_cfg = { 15 | "default_interval": "1 day", 16 | "min_interval": "12 hours", 17 | "max_interval": "1 day", 18 | "backoff_factor": 2, 19 | "max_queue_length": 5000, 20 | } 21 | -------------------------------------------------------------------------------- /swh/indexer/data/Gitea.csv: -------------------------------------------------------------------------------- 1 | Property,Gitea 2 | codeRepository,clone_url 3 | programmingLanguage,languages 4 | runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion, 19 | 
storageRequirements, 20 | supportingData, 21 | author,owner 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated,created_at 27 | dateModified,updated_at 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords, 34 | license, 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version, 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,website 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email, 53 | affiliation, 54 | identifier, 55 | name,name 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker, 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/CITATION: -------------------------------------------------------------------------------- 1 | Matthew B. Jones, Carl Boettiger, Abby Cabunoc Mayes, Arfon Smith, Peter Slaughter, Kyle Niemeyer, Yolanda Gil, Martin Fenner, Krzysztof Nowak, Mark Hahnel, Luke Coy, Alice Allen, Mercè Crosas, Ashley Sands, Neil Chue Hong, Patricia Cruse, Daniel S. Katz, Carole Goble. 2017. CodeMeta: an exchange schema for software metadata. Version 2.0. KNB Data Repository. 
doi:10.5063/schema/codemeta-2.0 2 | swh:1:dir:f39a0ef0005ad0dee50dcd546231ed568cf8705d;origin=https://github.com/codemeta/codemeta 3 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/codemeta-2.0.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": { 3 | "type": "@type", 4 | "id": "@id", 5 | "schema":"http://schema.org/", 6 | "codemeta": "https://codemeta.github.io/terms/", 7 | "Organization": {"@id": "schema:Organization"}, 8 | "Person": {"@id": "schema:Person"}, 9 | "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, 10 | "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, 11 | "Text": {"@id": "schema:Text"}, 12 | "URL": {"@id": "schema:URL"}, 13 | "address": { "@id": "schema:address"}, 14 | "affiliation": { "@id": "schema:affiliation"}, 15 | "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, 16 | "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, 17 | "citation": { "@id": "schema:citation"}, 18 | "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, 19 | "contributor": { "@id": "schema:contributor"}, 20 | "copyrightHolder": { "@id": "schema:copyrightHolder"}, 21 | "copyrightYear": { "@id": "schema:copyrightYear"}, 22 | "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date" }, 23 | "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, 24 | "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, 25 | "description": { "@id": "schema:description"}, 26 | "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, 27 | "email": { "@id": "schema:email"}, 28 | "editor": { "@id": "schema:editor"}, 29 | "encoding": { "@id": "schema:encoding"}, 30 | "familyName": { "@id": "schema:familyName"}, 31 | "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, 32 | "fileSize": { "@id": "schema:fileSize"}, 33 | 
"funder": { "@id": "schema:funder"}, 34 | "givenName": { "@id": "schema:givenName"}, 35 | "hasPart": { "@id": "schema:hasPart" }, 36 | "identifier": { "@id": "schema:identifier", "@type": "@id"}, 37 | "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, 38 | "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, 39 | "isPartOf": { "@id": "schema:isPartOf"}, 40 | "keywords": { "@id": "schema:keywords"}, 41 | "license": { "@id": "schema:license", "@type": "@id"}, 42 | "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, 43 | "name": { "@id": "schema:name"}, 44 | "operatingSystem": { "@id": "schema:operatingSystem"}, 45 | "permissions": { "@id": "schema:permissions"}, 46 | "position": { "@id": "schema:position"}, 47 | "processorRequirements": { "@id": "schema:processorRequirements"}, 48 | "producer": { "@id": "schema:producer"}, 49 | "programmingLanguage": { "@id": "schema:programmingLanguage"}, 50 | "provider": { "@id": "schema:provider"}, 51 | "publisher": { "@id": "schema:publisher"}, 52 | "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, 53 | "releaseNotes": { "@id": "schema:releaseNotes", "@type": "@id"}, 54 | "runtimePlatform": { "@id": "schema:runtimePlatform"}, 55 | "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, 56 | "softwareHelp": { "@id": "schema:softwareHelp"}, 57 | "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, 58 | "softwareVersion": { "@id": "schema:softwareVersion"}, 59 | "sponsor": { "@id": "schema:sponsor"}, 60 | "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, 61 | "supportingData": { "@id": "schema:supportingData"}, 62 | "targetProduct": { "@id": "schema:targetProduct"}, 63 | "url": { "@id": "schema:url", "@type": "@id"}, 64 | "version": { "@id": "schema:version"}, 65 | 66 | "author": { "@id": "schema:author", "@container": "@list" }, 67 | 68 | "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, 69 
| "contIntegration": { "@id": "codemeta:contIntegration", "@type": "@id"}, 70 | "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, 71 | "developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, 72 | "embargoDate": { "@id":"codemeta:embargoDate", "@type": "schema:Date" }, 73 | "funding": { "@id": "codemeta:funding" }, 74 | "readme": { "@id":"codemeta:readme", "@type": "@id" }, 75 | "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, 76 | "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, 77 | "maintainer": { "@id": "codemeta:maintainer" } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /swh/indexer/data/codemeta/codemeta-3.0.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": { 3 | "type": "@type", 4 | "id": "@id", 5 | "schema":"http://schema.org/", 6 | "codemeta": "https://codemeta.github.io/terms/", 7 | "Organization": {"@id": "schema:Organization"}, 8 | "Person": {"@id": "schema:Person"}, 9 | "Review": {"@id": "schema:Review"}, 10 | "Role": {"@id": "schema:Role"}, 11 | "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, 12 | "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, 13 | "Text": {"@id": "schema:Text"}, 14 | "URL": {"@id": "schema:URL"}, 15 | "address": { "@id": "schema:address"}, 16 | "affiliation": { "@id": "schema:affiliation"}, 17 | "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, 18 | "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, 19 | "citation": { "@id": "schema:citation"}, 20 | "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, 21 | "contributor": { "@id": "schema:contributor"}, 22 | "copyrightHolder": { "@id": "schema:copyrightHolder"}, 23 | "copyrightYear": { "@id": "schema:copyrightYear"}, 24 | "dateCreated": {"@id": "schema:dateCreated", 
"@type": "schema:Date" }, 25 | "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, 26 | "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, 27 | "description": { "@id": "schema:description"}, 28 | "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, 29 | "email": { "@id": "schema:email"}, 30 | "editor": { "@id": "schema:editor"}, 31 | "encoding": { "@id": "schema:encoding"}, 32 | "endDate": { "@id": "schema:endDate"}, 33 | "familyName": { "@id": "schema:familyName"}, 34 | "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, 35 | "fileSize": { "@id": "schema:fileSize"}, 36 | "funder": { "@id": "schema:funder"}, 37 | "givenName": { "@id": "schema:givenName"}, 38 | "hasPart": { "@id": "schema:hasPart" }, 39 | "identifier": { "@id": "schema:identifier", "@type": "@id"}, 40 | "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, 41 | "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, 42 | "isPartOf": { "@id": "schema:isPartOf"}, 43 | "keywords": { "@id": "schema:keywords"}, 44 | "license": { "@id": "schema:license", "@type": "@id"}, 45 | "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, 46 | "name": { "@id": "schema:name"}, 47 | "operatingSystem": { "@id": "schema:operatingSystem"}, 48 | "permissions": { "@id": "schema:permissions"}, 49 | "position": { "@id": "schema:position"}, 50 | "processorRequirements": { "@id": "schema:processorRequirements"}, 51 | "producer": { "@id": "schema:producer"}, 52 | "programmingLanguage": { "@id": "schema:programmingLanguage"}, 53 | "provider": { "@id": "schema:provider"}, 54 | "publisher": { "@id": "schema:publisher"}, 55 | "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, 56 | "review": { "@id": "schema:review", "@type": "@id" }, 57 | "reviewAspect": { "@id": "schema:reviewAspect" }, 58 | "reviewBody": { "@id": "schema:reviewBody" }, 59 | "releaseNotes": { "@id": "schema:releaseNotes"}, 60 | "roleName": { "@id": 
"schema:roleName"}, 61 | "runtimePlatform": { "@id": "schema:runtimePlatform"}, 62 | "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, 63 | "softwareHelp": { "@id": "schema:softwareHelp"}, 64 | "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, 65 | "softwareVersion": { "@id": "schema:softwareVersion"}, 66 | "sponsor": { "@id": "schema:sponsor"}, 67 | "startDate": { "@id": "schema:startDate"}, 68 | "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, 69 | "supportingData": { "@id": "schema:supportingData"}, 70 | "targetProduct": { "@id": "schema:targetProduct"}, 71 | "url": { "@id": "schema:url", "@type": "@id"}, 72 | "version": { "@id": "schema:version"}, 73 | "author": { "@id": "schema:author", "@container": "@list" }, 74 | 75 | "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, 76 | "continuousIntegration": { "@id": "codemeta:continuousIntegration", "@type": "@id"}, 77 | "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, 78 | "developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, 79 | "embargoEndDate": { "@id":"codemeta:embargoEndDate", "@type": "schema:Date" }, 80 | "funding": { "@id": "codemeta:funding" }, 81 | "readme": { "@id":"codemeta:readme", "@type": "@id" }, 82 | "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, 83 | "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, 84 | "maintainer": { "@id": "codemeta:maintainer" }, 85 | "hasSourceCode": { "@id": "codemeta:hasSourceCode", "@type": "@id"}, 86 | "isSourceCodeOf": { "@id": "codemeta:isSourceCodeOf", "@type": "@id"} 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /swh/indexer/data/composer.csv: -------------------------------------------------------------------------------- 1 | Property,Composer 2 | codeRepository,support.source 3 | programmingLanguage, 4 | 
runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements,require 18 | softwareVersion,version 19 | storageRequirements, 20 | supportingData, 21 | author,authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,keywords 34 | license,license 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier,name 46 | name,name 47 | sameAs, 48 | url,homepage 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email,author.email 53 | affiliation, 54 | identifier, 55 | name,author.name 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions,suggest 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker,support.issues 67 | referencePublication, 68 | readme, -------------------------------------------------------------------------------- /swh/indexer/data/nuget.csv: -------------------------------------------------------------------------------- 1 | Property,NuGet 2 | codeRepository,repository.url 3 | programmingLanguage, 4 | runtimePlatform, 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes,releaseNotes 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion, 19 | storageRequirements, 20 | supportingData, 21 | author,authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | 
copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,tags 34 | license,license/licenseUrl 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description/summary 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,projectUrl 49 | relatedLink, 50 | givenName, 51 | familyName, 52 | email, 53 | affiliation, 54 | identifier,id 55 | name, 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker, 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/pubspec.csv: -------------------------------------------------------------------------------- 1 | Property,Pubspec 2 | codeRepository,repository 3 | programmingLanguage, 4 | runtimePlatform,platforms 5 | targetProduct, 6 | applicationCategory, 7 | applicationSubCategory, 8 | downloadUrl, 9 | fileSize, 10 | installUrl, 11 | memoryRequirements, 12 | operatingSystem, 13 | permissions, 14 | processorRequirements, 15 | releaseNotes, 16 | softwareHelp, 17 | softwareRequirements, 18 | softwareVersion,version 19 | storageRequirements, 20 | supportingData, 21 | author,author/authors 22 | citation, 23 | contributor, 24 | copyrightHolder, 25 | copyrightYear, 26 | dateCreated, 27 | dateModified, 28 | datePublished, 29 | editor, 30 | encoding, 31 | fileFormat, 32 | funder, 33 | keywords,keywords 34 | license,license 35 | producer, 36 | provider, 37 | publisher, 38 | sponsor, 39 | version,version 40 | isAccessibleForFree, 41 | isPartOf, 42 | hasPart, 43 | position, 44 | description,description 45 | identifier, 46 | name,name 47 | sameAs, 48 | url,homepage 49 | relatedLink, 50 | givenName, 51 | familyName, 52 
| email,author.email/authors.email 53 | affiliation, 54 | identifier, 55 | name, 56 | address, 57 | type, 58 | id, 59 | softwareSuggestions, 60 | maintainer, 61 | contIntegration, 62 | buildInstructions, 63 | developmentStatus, 64 | embargoDate, 65 | funding, 66 | issueTracker,issue_tracker 67 | referencePublication, 68 | readme, 69 | -------------------------------------------------------------------------------- /swh/indexer/data/schema.org/CITATION: -------------------------------------------------------------------------------- 1 | swh:1:cnt:8e5a38fb91a1e8ef272f02a43e67dfaf56cb8e6d;origin=https://github.com/schemaorg/schemaorg;visit=swh:1:snp:0115fb30b95e3dd177271313dcaaa898888e7a4c;anchor=swh:1:rev:d9bc1d722034ea2ebd3e3ea97f8f21d3e692b3f0;path=/data/releases/28.0/schemaorgcontext.jsonld 2 | -------------------------------------------------------------------------------- /swh/indexer/fossology_license.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import logging 7 | import subprocess 8 | from typing import Any, Dict, List, Optional 9 | 10 | import sentry_sdk 11 | 12 | from swh.core.config import merge_configs 13 | from swh.indexer.storage.interface import IndexerStorageInterface 14 | from swh.indexer.storage.model import ContentLicenseRow 15 | from swh.model import hashutil 16 | from swh.objstorage.interface import CompositeObjId 17 | 18 | from .indexer import ContentIndexer, write_to_temp 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def compute_license(path) -> Dict: 24 | """Determine license from file at path. 
25 | 26 | Args: 27 | path: filepath to determine the license 28 | 29 | Returns: 30 | dict: A dict with the following keys: 31 | 32 | - licenses ([str]): associated detected licenses to path 33 | - path (bytes): content filepath 34 | 35 | """ 36 | try: 37 | properties = subprocess.check_output(["nomossa", path], universal_newlines=True) 38 | if properties: 39 | res = properties.rstrip().split(" contains license(s) ") 40 | licenses = res[1].split(",") 41 | else: 42 | licenses = [] 43 | 44 | return { 45 | "licenses": licenses, 46 | "path": path, 47 | } 48 | except subprocess.CalledProcessError: 49 | from os import path as __path 50 | 51 | logger.exception( 52 | "Problem during license detection for sha1 %s" % __path.basename(path) 53 | ) 54 | sentry_sdk.capture_exception() 55 | return { 56 | "licenses": [], 57 | "path": path, 58 | } 59 | 60 | 61 | DEFAULT_CONFIG: Dict[str, Any] = { 62 | "workdir": "/tmp/swh/indexer.fossology.license", 63 | "tools": { 64 | "name": "nomos", 65 | "version": "3.1.0rc2-31-ga2cbb8c", 66 | "configuration": { 67 | "command_line": "nomossa ", 68 | }, 69 | }, 70 | "write_batch_size": 1000, 71 | } 72 | 73 | 74 | class MixinFossologyLicenseIndexer: 75 | """Mixin fossology license indexer. 76 | 77 | See :class:`FossologyLicenseIndexer` 78 | 79 | """ 80 | 81 | tool: Any 82 | idx_storage: IndexerStorageInterface 83 | 84 | def __init__(self, *args, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | self.config = merge_configs(DEFAULT_CONFIG, self.config) 87 | self.working_directory = self.config["workdir"] 88 | 89 | def index( 90 | self, id: CompositeObjId, data: Optional[bytes] = None, **kwargs 91 | ) -> List[ContentLicenseRow]: 92 | """Index sha1s' content and store result. 
93 | 94 | Args: 95 | id (bytes): content's identifier 96 | raw_content (bytes): associated raw content to content id 97 | 98 | Returns: 99 | dict: A dict, representing a content_license, with keys: 100 | 101 | - id (bytes): content's identifier (sha1) 102 | - license (bytes): license in bytes 103 | - path (bytes): path 104 | - indexer_configuration_id (int): tool used to compute the output 105 | 106 | """ 107 | assert data is not None 108 | with write_to_temp( 109 | filename=hashutil.hash_to_hex(id["sha1"]), # use the id as pathname 110 | data=data, 111 | working_directory=self.working_directory, 112 | ) as content_path: 113 | properties = compute_license(path=content_path) 114 | return [ 115 | ContentLicenseRow( 116 | id=id["sha1"], 117 | indexer_configuration_id=self.tool["id"], 118 | license=license, 119 | ) 120 | for license in properties["licenses"] 121 | ] 122 | 123 | def persist_index_computations( 124 | self, results: List[ContentLicenseRow] 125 | ) -> Dict[str, int]: 126 | """Persist the results in storage. 
127 | 128 | Args: 129 | results: list of content_license dict with the 130 | following keys: 131 | 132 | - id (bytes): content's identifier (sha1) 133 | - license (bytes): license in bytes 134 | - path (bytes): path 135 | 136 | """ 137 | return self.idx_storage.content_fossology_license_add(results) 138 | 139 | 140 | class FossologyLicenseIndexer( 141 | MixinFossologyLicenseIndexer, ContentIndexer[ContentLicenseRow] 142 | ): 143 | """Indexer in charge of: 144 | 145 | - filtering out content already indexed 146 | - reading content from objstorage per the content's id (sha1) 147 | - computing {license, encoding} from that content 148 | - store result in storage 149 | 150 | """ 151 | 152 | pass 153 | -------------------------------------------------------------------------------- /swh/indexer/metadata_detector.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Dict, List 7 | 8 | from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS 9 | from swh.indexer.metadata_dictionary.base import DirectoryLsEntry 10 | from swh.objstorage.interface import CompositeObjId 11 | 12 | 13 | def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[CompositeObjId]]: 14 | """ 15 | Detects files potentially containing metadata 16 | 17 | Args: 18 | file_entries (list): list of files 19 | 20 | Returns: 21 | dict: {mapping_filenames[name]:f['sha1']} (may be empty) 22 | """ 23 | results = {} 24 | for mapping_name, mapping in INTRINSIC_MAPPINGS.items(): 25 | matches = mapping.detect_metadata_files(files) 26 | if matches: 27 | results[mapping_name] = matches 28 | return results 29 | 
-------------------------------------------------------------------------------- /swh/indexer/metadata_dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import collections 7 | from typing import Dict, Type 8 | 9 | import click 10 | 11 | from . import ( 12 | cff, 13 | codemeta, 14 | composer, 15 | dart, 16 | gitea, 17 | github, 18 | maven, 19 | npm, 20 | nuget, 21 | python, 22 | ruby, 23 | ) 24 | from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping 25 | 26 | INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { 27 | "CffMapping": cff.CffMapping, 28 | "CodemetaMapping": codemeta.CodemetaMapping, 29 | "GemspecMapping": ruby.GemspecMapping, 30 | "MavenMapping": maven.MavenMapping, 31 | "NpmMapping": npm.NpmMapping, 32 | "PubMapping": dart.PubspecMapping, 33 | "PythonPkginfoMapping": python.PythonPkginfoMapping, 34 | "ComposerMapping": composer.ComposerMapping, 35 | "NuGetMapping": nuget.NuGetMapping, 36 | } 37 | 38 | EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { 39 | "GiteaMapping": gitea.GiteaMapping, 40 | "GitHubMapping": github.GitHubMapping, 41 | "JsonSwordCodemetaMapping": codemeta.JsonSwordCodemetaMapping, 42 | "SwordCodemetaMapping": codemeta.SwordCodemetaMapping, 43 | } 44 | 45 | 46 | MAPPINGS: Dict[str, Type[BaseMapping]] = {**INTRINSIC_MAPPINGS, **EXTRINSIC_MAPPINGS} 47 | 48 | 49 | def list_terms(): 50 | """Returns a dictionary with all supported CodeMeta terms as keys, 51 | and the mappings that support each of them as values.""" 52 | d = collections.defaultdict(set) 53 | for mapping in MAPPINGS.values(): 54 | for term in mapping.supported_terms(): 55 | d[term].add(mapping) 56 | return d 57 
# Copyright (C) 2021-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict, List, Optional
import urllib.parse

from rdflib import BNode, Graph, Literal, URIRef
import rdflib.term

from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import RDF, SCHEMA

from .base import SingleFileIntrinsicMapping, YamlMapping
from .utils import add_map

DOI = URIRef("https://doi.org/")
SPDX = URIRef("https://spdx.org/licenses/")


class CffMapping(YamlMapping, SingleFileIntrinsicMapping):
    """Dedicated class for Citation (CITATION.cff) mapping and translation"""

    name = "cff"
    filename = b"CITATION.cff"
    mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
    string_fields = ["title", "keywords", "license", "abstract", "version", "doi"]
    date_fields = ["date-released"]
    uri_fields = ["url", "repository-code"]

    def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
        """Add a node for ``author`` to ``graph`` and return it.

        Authors whose ``orcid`` field parses as an absolute URL are
        identified by that URL; everyone else gets a blank node.
        """
        node: rdflib.term.Node
        if (
            "orcid" in author
            and isinstance(author["orcid"], str)
            and urllib.parse.urlparse(author["orcid"]).netloc
        ):
            node = URIRef(author["orcid"].strip())
        else:
            node = BNode()
        graph.add((node, RDF.type, SCHEMA.Person))
        if "affiliation" in author and isinstance(author["affiliation"], str):
            affiliation = BNode()
            graph.add((node, SCHEMA.affiliation, affiliation))
            graph.add((affiliation, RDF.type, SCHEMA.Organization))
            graph.add((affiliation, SCHEMA.name, Literal(author["affiliation"])))
        if "family-names" in author and isinstance(author["family-names"], str):
            graph.add((node, SCHEMA.familyName, Literal(author["family-names"])))
        if "given-names" in author and isinstance(author["given-names"], str):
            graph.add((node, SCHEMA.givenName, Literal(author["given-names"])))
        return node

    def translate_authors(
        self, graph: Graph, root: URIRef, authors: List[dict]
    ) -> None:
        """Translate the CFF ``authors`` list into ``schema:author`` triples."""
        add_map(graph, root, SCHEMA.author, self._translate_author, authors)

    def normalize_doi(self, s: str) -> Optional[URIRef]:
        # Return type fixed to Optional: non-string values are ignored and
        # previously fell through to an implicit (untyped) None.
        if isinstance(s, str):
            return DOI + s
        return None

    def normalize_license(self, s: str) -> Optional[URIRef]:
        # The license field is expected to be an SPDX identifier string;
        # anything else is ignored (returns None, not URIRef as previously
        # annotated).
        if isinstance(s, str):
            return SPDX + s
        return None

    def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]:
        # https://github.com/citation-file-format/citation-file-format/blob/main/schema-guide.md#credit-redirection
        return super()._translate_dict(
            content_dict.get("preferred-citation", content_dict)
        )
class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping):
    """Dedicated class for Packagist(composer.json) mapping and translation"""

    name = "composer"
    mapping = COMPOSER_TABLE["Composer"]
    filename = b"composer.json"
    string_fields = [
        "name",
        "description",
        "version",
        "keywords",
        "license",
        "author",
        "authors",
    ]
    uri_fields = ["homepage"]

    def normalize_license(self, s):
        # Composer licenses are SPDX identifiers; non-strings are dropped.
        return SPDX + s if isinstance(s, str) else None

    def _translate_author(self, graph: Graph, author) -> Optional[BNode]:
        """Turn one entry of the ``authors`` array into a schema:Person node.

        Returns None (so the entry is skipped) when it is not a JSON object.
        """
        if not isinstance(author, dict):
            return None
        person = BNode()
        graph.add((person, RDF.type, SCHEMA.Person))

        for json_key, schema_property in (
            ("name", SCHEMA.name),
            ("email", SCHEMA.email),
        ):
            value = author.get(json_key)
            if isinstance(value, str):
                graph.add((person, schema_property, Literal(value)))

        return person

    def translate_authors(self, graph: Graph, root: URIRef, authors) -> None:
        """Translate the ``authors`` array into an ordered schema:author list."""
        add_map(graph, root, SCHEMA.author, self._translate_author, authors)
License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import os.path 7 | import re 8 | 9 | from rdflib import RDF, BNode, Graph, Literal, URIRef 10 | 11 | from swh.indexer.codemeta import _DATA_DIR, read_crosstable 12 | from swh.indexer.namespaces import SCHEMA 13 | 14 | from .base import SingleFileIntrinsicMapping, YamlMapping 15 | from .utils import add_map 16 | 17 | SPDX = URIRef("https://spdx.org/licenses/") 18 | 19 | PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv") 20 | 21 | with open(PUB_TABLE_PATH) as fd: 22 | (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd) 23 | 24 | 25 | def name_to_person(name): 26 | return { 27 | "@type": SCHEMA.Person, 28 | SCHEMA.name: name, 29 | } 30 | 31 | 32 | class PubspecMapping(YamlMapping, SingleFileIntrinsicMapping): 33 | name = "pubspec" 34 | filename = b"pubspec.yaml" 35 | mapping = PUB_TABLE["Pubspec"] 36 | string_fields = [ 37 | "repository", 38 | "keywords", 39 | "description", 40 | "name", 41 | "issue_tracker", 42 | "platforms", 43 | "license", 44 | # license will only be used with the SPDX Identifier 45 | ] 46 | uri_fields = ["homepage"] 47 | 48 | def normalize_license(self, s): 49 | if isinstance(s, str): 50 | return SPDX + s 51 | 52 | def _translate_author(self, graph, s): 53 | name_email_re = re.compile("(?P.*?)( <(?P.*)>)") 54 | if isinstance(s, str): 55 | author = BNode() 56 | graph.add((author, RDF.type, SCHEMA.Person)) 57 | match = name_email_re.search(s) 58 | if match: 59 | name = match.group("name") 60 | email = match.group("email") 61 | graph.add((author, SCHEMA.email, Literal(email))) 62 | else: 63 | name = s 64 | 65 | graph.add((author, SCHEMA.name, Literal(name))) 66 | 67 | return author 68 | 69 | def translate_author(self, graph: Graph, root, s) -> None: 70 | add_map(graph, root, SCHEMA.author, self._translate_author, [s]) 71 | 72 | def translate_authors(self, graph: Graph, root, authors) -> None: 73 | if isinstance(authors, 
class NuGetMapping(XmlMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for NuGet (.nuspec) mapping and translation
    """

    name = "nuget"
    filename = re.compile(rb".*\.nuspec")
    # Build a private copy of the crosswalk row instead of mutating the
    # shared NUGET_TABLE["NuGet"] dict at class-definition time, so other
    # users of the table do not observe the two extra keys.
    mapping = {
        **NUGET_TABLE["NuGet"],
        "copyright": URIRef("http://schema.org/copyrightNotice"),
        "language": URIRef("http://schema.org/inLanguage"),
    }
    string_fields = [
        "description",
        "version",
        "name",
        "tags",
        "license",
        "summary",
        "copyright",
        "language",
    ]
    uri_fields = ["projectUrl", "licenseUrl"]

    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
        # The interesting metadata lives under <package><metadata>.
        return super()._translate_dict(d.get("package", {}).get("metadata", {}))

    def translate_repository(self, graph, root, v):
        """Translate a ``<repository url="..."/>`` element."""
        if isinstance(v, dict):
            # .get() instead of [] so a <repository> element without a
            # url attribute does not raise KeyError.
            url = v.get("@url")
            if isinstance(url, str):
                codemeta_key = URIRef(self.mapping["repository.url"])
                add_url_if_valid(graph, root, codemeta_key, url)

    def normalize_license(self, v):
        """Normalize a ``<license type="expression">`` SPDX expression.

        Only plain identifiers and "A or B" disjunctions are translated;
        expressions with parentheses, "and" or "with" are left out.
        """
        # .get() guards: a malformed element without @type/#text previously
        # raised KeyError.
        if isinstance(v, dict) and v.get("@type") == "expression":
            license_string = v.get("#text")
            if not isinstance(license_string, str):
                return None
            if not bool(
                re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE)
            ):
                return [
                    SPDX + license_type.strip()
                    for license_type in re.split(
                        r" or ", license_string, flags=re.IGNORECASE
                    )
                ]
        return None

    def translate_authors(self, graph: Graph, root, s):
        """Translate the comma-separated ``<authors>`` string."""
        if isinstance(s, str):
            authors = []
            for author_name in s.split(","):
                author_name = author_name.strip()
                author = BNode()
                graph.add((author, RDF.type, SCHEMA.Person))
                graph.add((author, SCHEMA.name, Literal(author_name)))
                authors.append(author)
            add_list(graph, root, SCHEMA.author, authors)

    def translate_releaseNotes(self, graph: Graph, root, s):
        if isinstance(s, str):
            graph.add((root, SCHEMA.releaseNotes, Literal(s)))

    def normalize_tags(self, s):
        # Tags are space-separated in .nuspec files.
        if isinstance(s, str):
            return [Literal(tag) for tag in s.split(" ")]
class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy):
    """Header policy that keeps the line breaks of folded header values.

    Folded continuation lines ("\\n ") are collapsed to plain newlines
    instead of being refolded, so multi-line fields keep their original
    line structure.
    """

    def header_fetch_parse(self, name, value):
        # Values carrying a .name attribute are already parsed header
        # objects; return them untouched.
        if hasattr(value, "name"):
            return value
        unfolded = value.replace("\n ", "\n")
        return self.header_factory(name, unfolded)
class GemspecMapping(DictMapping, SingleFileIntrinsicMapping):
    """Intrinsic metadata mapping for Ruby ``*.gemspec`` files."""

    name = "gemspec"
    filename = re.compile(rb".*\.gemspec")
    mapping = CROSSWALK_TABLE["Ruby Gem"]
    string_fields = ["name", "version", "description", "summary", "email"]
    uri_fields = ["homepage"]

    _re_spec_new = re.compile(r".*Gem::Specification.new +(do|\{) +\|.*\|.*")
    # NOTE(review): named groups restored -- "(?P\w+)" is not valid
    # named-group syntax and would raise re.error, while translate() below
    # reads group("key") and group("expr").
    _re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)")

    def translate(self, raw_content):
        """Extract the simple ``spec.key = value`` assignments of a gemspec.

        Args:
            raw_content: the gemspec file content, as bytes

        Returns:
            the translated metadata dict, or None when the content cannot
            be decoded or contains no ``Gem::Specification`` block.
        """
        try:
            raw_content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning("Error unidecoding from %s", self.log_suffix)
            return

        # Skip lines before 'Gem::Specification.new'
        lines = itertools.dropwhile(
            lambda x: not self._re_spec_new.match(x), raw_content.split("\n")
        )

        try:
            next(lines)  # Consume 'Gem::Specification.new'
        except StopIteration:
            self.log.warning("Could not find Gem::Specification in %s", self.log_suffix)
            return

        content_dict = {}
        for line in lines:
            match = self._re_spec_entry.match(line)
            if match:
                value = self.eval_ruby_expression(match.group("expr"))
                if value:
                    content_dict[match.group("key")] = value
        return self._translate_dict(content_dict)

    def eval_ruby_expression(self, expr):
        """Very simple evaluator of Ruby expressions.

        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
        ['Foo', 'bar']
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression( \
            "['Foo'.freeze, 'bar'.freeze]")
        ['Foo', 'bar']
        """

        def evaluator(node):
            # Python >= 3.8 parses string literals as ast.Constant, and
            # ast.Str was removed in Python 3.12; accept both node types so
            # the evaluator works on every supported interpreter.
            if isinstance(node, ast.Constant):
                if isinstance(node.value, str):
                    return node.value
            elif isinstance(node, getattr(ast, "Str", ())):
                return node.s
            elif isinstance(node, ast.List):
                res = []
                for element in node.elts:
                    val = evaluator(element)
                    if not val:
                        return
                    res.append(val)
                return res

        expr = expr.replace(".freeze", "")
        try:
            # We're parsing Ruby expressions here, but Python's
            # ast.parse works for very simple Ruby expressions
            # (mainly strings delimited with " or ', and lists
            # of such strings).
            tree = ast.parse(expr, mode="eval")
        except (SyntaxError, ValueError):
            return
        if isinstance(tree, ast.Expression):
            return evaluator(tree.body)

    def normalize_license(self, s):
        # A single license identifier maps to its SPDX URI.
        if isinstance(s, str):
            return SPDX + s

    def normalize_licenses(self, licenses):
        # A list of licenses: only the string entries are kept.
        if isinstance(licenses, list):
            return [SPDX + license for license in licenses if isinstance(license, str)]

    def translate_author(self, graph: Graph, root, author):
        if isinstance(author, str):
            add_map(graph, root, SCHEMA.author, name_to_person, [author])

    def translate_authors(self, graph: Graph, root, authors):
        if isinstance(authors, list):
            add_map(graph, root, SCHEMA.author, name_to_person, authors)
def add_map(
    graph: Graph,
    subject: rdflib.term.Node,
    predicate: rdflib.term.Identifier,
    f: Callable[[Graph, TValue], Optional[rdflib.term.Node]],
    values: Iterable[TValue],
) -> None:
    """Helper for :func:`add_list` that takes a mapper function ``f``.

    ``f`` is applied to every value; values mapped to a falsy result
    (e.g. ``None`` for unparseable entries) are dropped before the
    remaining nodes are linked into an RDF list.
    """
    mapped = (f(graph, value) for value in values)
    add_list(graph, subject, predicate, [node for node in mapped if node])
graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" 93 | ... ) 94 | >>> add_url_if_valid( 95 | ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" 96 | ... ) 97 | >>> add_url_if_valid( 98 | ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" 99 | ... ) 100 | >>> add_url_if_valid( 101 | ... graph, subject, predicate, 42 102 | ... ) 103 | >>> pprint(set(graph.triples((subject, predicate, None)))) 104 | {(rdflib.term.URIRef('http://example.org/test-software'), 105 | rdflib.term.URIRef('http://schema.org/license'), 106 | rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} 107 | """ 108 | if not isinstance(url, str): 109 | return 110 | try: 111 | parsed_url = urllib.parse.urlparse(url) 112 | except Exception: 113 | return 114 | if " " in url or not parsed_url.netloc: 115 | return 116 | graph.add((subject, predicate, rdflib.term.URIRef(url))) 117 | -------------------------------------------------------------------------------- /swh/indexer/mimetype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2016-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict, List, Optional 7 | 8 | import magic 9 | 10 | from swh.core.config import merge_configs 11 | from swh.indexer.storage.interface import IndexerStorageInterface 12 | from swh.indexer.storage.model import ContentMimetypeRow 13 | from swh.objstorage.interface import CompositeObjId 14 | 15 | from .indexer import ContentIndexer 16 | 17 | if not hasattr(magic.Magic, "from_buffer"): 18 | raise ImportError( 19 | 'Expected "import magic" to import python-magic, but file_magic ' 20 | "was imported instead." 
def compute_mimetype_encoding(raw_content: bytes) -> Dict[str, str]:
    """Determine mimetype and encoding from the raw content.

    Args:
        raw_content: content's raw data

    Returns:
        dict: mimetype and encoding key and corresponding values.

    """
    detector = magic.Magic(mime=True, mime_encoding=True)
    detected = detector.from_buffer(raw_content)
    # libmagic reports e.g. "text/plain; charset=us-ascii"; when the
    # separator is absent the whole string is the mimetype and the
    # encoding is left empty.
    try:
        mimetype, encoding = detected.split("; charset=")
    except ValueError:
        mimetype, encoding = detected, ""
    return {
        "mimetype": mimetype,
        "encoding": encoding,
    }
74 | 75 | Args: 76 | id: content's identifier 77 | data: raw content in bytes 78 | 79 | Returns: 80 | dict: content's mimetype; dict keys being 81 | 82 | - id: content's identifier (sha1) 83 | - mimetype: mimetype in bytes 84 | - encoding: encoding in bytes 85 | 86 | """ 87 | assert data is not None 88 | properties = compute_mimetype_encoding(data) 89 | return [ 90 | ContentMimetypeRow( 91 | id=id["sha1"], 92 | indexer_configuration_id=self.tool["id"], 93 | mimetype=properties["mimetype"], 94 | encoding=properties["encoding"], 95 | ) 96 | ] 97 | 98 | def persist_index_computations( 99 | self, results: List[ContentMimetypeRow] 100 | ) -> Dict[str, int]: 101 | """Persist the results in storage. 102 | 103 | Args: 104 | results: list of content's mimetype dicts 105 | (see :meth:`.index`) 106 | 107 | """ 108 | return self.idx_storage.content_mimetype_add(results) 109 | 110 | 111 | class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]): 112 | """Mimetype Indexer working on list of content identifiers. 113 | 114 | It: 115 | 116 | - (optionally) filters out content already indexed (cf. 
117 | :meth:`.filter`) 118 | - reads content from objstorage per the content's id (sha1) 119 | - computes {mimetype, encoding} from that content 120 | - stores result in storage 121 | 122 | """ 123 | 124 | def filter(self, ids: List[CompositeObjId]): 125 | """Filter out known sha1s and return only missing ones.""" 126 | yield from self.idx_storage.content_mimetype_missing( 127 | ( 128 | { 129 | "id": id["sha1"], 130 | "indexer_configuration_id": self.tool["id"], 131 | } 132 | for id in ids 133 | ) 134 | ) 135 | -------------------------------------------------------------------------------- /swh/indexer/namespaces.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from rdflib import Namespace as _Namespace 7 | from rdflib import RDF # noqa 8 | 9 | SCHEMA = _Namespace("http://schema.org/") 10 | CODEMETA = _Namespace("https://codemeta.github.io/terms/") 11 | FORGEFED = _Namespace("https://forgefed.org/ns#") 12 | ACTIVITYSTREAMS = _Namespace("https://www.w3.org/ns/activitystreams#") 13 | SPDX_LICENSES = _Namespace("https://spdx.org/licenses/") 14 | XSD = _Namespace("http://www.w3.org/2001/XMLSchema#") 15 | -------------------------------------------------------------------------------- /swh/indexer/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 
class RemoteStorage(RPCClient):
    """Proxy to a remote storage API"""

    # Interface whose endpoints this client exposes (see
    # swh.core.api.RPCClient for how calls are generated from it).
    backend_class = IndexerStorageInterface
    # Exception type raised for transport/API-level failures.
    api_exception = IndexerStorageAPIError
    # Server-side exceptions re-raised as-is on the client instead of
    # being wrapped in api_exception.
    reraise_exceptions = [IndexerStorageArgumentException, DuplicateId]
    # Extra (de)serializers for swh.indexer.storage.model row objects.
    extra_type_decoders = DECODERS
    extra_type_encoders = ENCODERS
9 | 10 | import swh.indexer.storage.model as idx_model 11 | 12 | 13 | def _encode_model_object(obj): 14 | d = obj.to_dict() 15 | d["__type__"] = type(obj).__name__ 16 | return d 17 | 18 | 19 | ENCODERS: List[Tuple[type, str, Callable]] = [ 20 | (idx_model.BaseRow, "idx_model", _encode_model_object), 21 | ] 22 | 23 | 24 | DECODERS: Dict[str, Callable] = { 25 | "idx_model": lambda d: getattr(idx_model, d.pop("__type__")).from_dict(d), 26 | } 27 | -------------------------------------------------------------------------------- /swh/indexer/storage/api/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import logging 7 | import os 8 | from typing import Any, Dict, Optional 9 | import warnings 10 | 11 | from swh.core import config 12 | from swh.core.api import RPCServerApp 13 | from swh.core.api import encode_data_server as encode_data 14 | from swh.core.api import error_handler 15 | from swh.indexer.storage import INDEXER_CFG_KEY, get_indexer_storage 16 | from swh.indexer.storage.exc import IndexerStorageArgumentException 17 | from swh.indexer.storage.interface import IndexerStorageInterface 18 | 19 | from .serializers import DECODERS, ENCODERS 20 | 21 | 22 | def get_storage(): 23 | global storage 24 | if not storage: 25 | storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) 26 | 27 | return storage 28 | 29 | 30 | class IndexerStorageServerApp(RPCServerApp): 31 | extra_type_decoders = DECODERS 32 | extra_type_encoders = ENCODERS 33 | 34 | 35 | app = IndexerStorageServerApp( 36 | __name__, backend_class=IndexerStorageInterface, backend_factory=get_storage 37 | ) 38 | storage = None 39 | 40 | 41 | @app.errorhandler(Exception) 42 | def 
def load_and_check_config(
    config_path: Optional[str],
) -> Dict[str, Any]:
    """Check the minimal configuration is set to run the api or raise an
    error explanation.

    Args:
        config_path: Path to the configuration file to load

    Raises:
        EnvironmentError: if no configuration path is given
        FileNotFoundError: if the configuration file does not exist
        KeyError: if the 'indexer_storage' section is missing

    Returns:
        configuration as a dict

    """
    if not config_path:
        raise EnvironmentError("Configuration file must be defined")

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file {config_path} does not exist")

    cfg = config.read(config_path)
    if "indexer.storage" in cfg:
        warnings.warn(
            "The 'indexer.storage' configuration section should be renamed "
            "as 'indexer_storage'",
            DeprecationWarning,
        )
        cfg["indexer_storage"] = cfg.pop("indexer.storage")
    if "indexer_storage" not in cfg:
        # Message fixed: it previously read "Missing '%indexer_storage'",
        # a leftover of %-style formatting.
        raise KeyError("Missing 'indexer_storage' configuration")

    return cfg
102 | 103 | """ 104 | global api_cfg 105 | if not api_cfg: 106 | config_path = os.environ.get("SWH_CONFIG_FILENAME") 107 | api_cfg = load_and_check_config(config_path) 108 | app.config.update(api_cfg) 109 | handler = logging.StreamHandler() 110 | app.logger.addHandler(handler) 111 | return app 112 | 113 | 114 | if __name__ == "__main__": 115 | print("Deprecated. Use swh-indexer") 116 | -------------------------------------------------------------------------------- /swh/indexer/storage/converters.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | def db_to_mimetype(mimetype): 8 | """Convert a mimetype entry into a ready mimetype output.""" 9 | return { 10 | "id": mimetype["id"], 11 | "encoding": mimetype["encoding"], 12 | "mimetype": mimetype["mimetype"], 13 | "tool": { 14 | "id": mimetype["tool_id"], 15 | "name": mimetype["tool_name"], 16 | "version": mimetype["tool_version"], 17 | "configuration": mimetype["tool_configuration"], 18 | }, 19 | } 20 | 21 | 22 | def db_to_metadata(metadata): 23 | """Convert a metadata entry into a ready metadata output.""" 24 | metadata["tool"] = { 25 | "id": metadata["tool_id"], 26 | "name": metadata["tool_name"], 27 | "version": metadata["tool_version"], 28 | "configuration": metadata["tool_configuration"], 29 | } 30 | del metadata["tool_id"], metadata["tool_configuration"] 31 | del metadata["tool_version"], metadata["tool_name"] 32 | return metadata 33 | 34 | 35 | def db_to_fossology_license(license): 36 | return { 37 | "id": license["id"], 38 | "license": license["license"], 39 | "tool": { 40 | "id": license["tool_id"], 41 | "name": license["tool_name"], 42 | "version": license["tool_version"], 43 | "configuration": 
license["tool_configuration"], 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /swh/indexer/storage/exc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | 7 | class IndexerStorageAPIError(Exception): 8 | """Generic error of the indexer storage.""" 9 | 10 | pass 11 | 12 | 13 | class IndexerStorageArgumentException(Exception): 14 | """Argument passed to an IndexerStorage endpoint is invalid.""" 15 | 16 | pass 17 | 18 | 19 | class DuplicateId(IndexerStorageArgumentException): 20 | """The same identifier is present more than once.""" 21 | 22 | pass 23 | -------------------------------------------------------------------------------- /swh/indexer/storage/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2020 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from functools import wraps 7 | import logging 8 | 9 | from swh.core.statsd import statsd 10 | 11 | OPERATIONS_METRIC = "swh_indexer_storage_operations_total" 12 | OPERATIONS_UNIT_METRIC = "swh_indexer_storage_operations_{unit}_total" 13 | DURATION_METRIC = "swh_indexer_storage_request_duration_seconds" 14 | 15 | 16 | def timed(f): 17 | """Time that function!""" 18 | 19 | @wraps(f) 20 | def d(*a, **kw): 21 | with statsd.timed(DURATION_METRIC, tags={"endpoint": f.__name__}): 22 | return f(*a, **kw) 23 | 24 | return d 25 | 26 | 27 | def send_metric(metric, count, method_name): 28 | """Send statsd metric with 
count for method `method_name` 29 | 30 | If count is 0, the metric is discarded. If the metric is not 31 | parseable, the metric is discarded with a log message. 32 | 33 | Args: 34 | metric (str): Metric's name (e.g content:add, content:add:bytes) 35 | count (int): Associated value for the metric 36 | method_name (str): Method's name 37 | 38 | Returns: 39 | Bool to explicit if metric has been set or not 40 | """ 41 | if count == 0: 42 | return False 43 | 44 | metric_type = metric.split(":") 45 | _length = len(metric_type) 46 | if _length == 2: 47 | object_type, operation = metric_type 48 | metric_name = OPERATIONS_METRIC 49 | elif _length == 3: 50 | object_type, operation, unit = metric_type 51 | metric_name = OPERATIONS_UNIT_METRIC.format(unit=unit) 52 | else: 53 | logging.warning("Skipping unknown metric {%s: %s}" % (metric, count)) 54 | return False 55 | 56 | statsd.increment( 57 | metric_name, 58 | count, 59 | tags={ 60 | "endpoint": method_name, 61 | "object_type": object_type, 62 | "operation": operation, 63 | }, 64 | ) 65 | return True 66 | 67 | 68 | def process_metrics(f): 69 | """Increment object counters for the decorated function.""" 70 | 71 | @wraps(f) 72 | def d(*a, **kw): 73 | r = f(*a, **kw) 74 | for metric, count in r.items(): 75 | send_metric(metric=metric, count=count, method_name=f.__name__) 76 | 77 | return r 78 | 79 | return d 80 | -------------------------------------------------------------------------------- /swh/indexer/storage/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | """Classes used internally by the in-memory idx-storage, and will be 7 | used for the interface of the idx-storage in the near future.""" 8 | 9 | from 
__future__ import annotations 10 | 11 | import json 12 | from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar 13 | 14 | import attr 15 | from typing_extensions import Final 16 | 17 | from swh.model.model import Sha1Git 18 | 19 | TSelf = TypeVar("TSelf") 20 | 21 | 22 | @attr.s 23 | class BaseRow: 24 | UNIQUE_KEY_FIELDS: Tuple = ("id",) 25 | 26 | id = attr.ib(type=Any) 27 | indexer_configuration_id = attr.ib(type=Optional[int], default=None, kw_only=True) 28 | tool = attr.ib(type=Optional[Dict], default=None, kw_only=True) 29 | 30 | def __attrs_post_init__(self): 31 | if self.indexer_configuration_id is None and self.tool is None: 32 | raise TypeError("Either indexer_configuration_id or tool must be not None.") 33 | if self.indexer_configuration_id is not None and self.tool is not None: 34 | raise TypeError( 35 | "indexer_configuration_id and tool are mutually exclusive; " 36 | "only one may be not None." 37 | ) 38 | 39 | def anonymize(self: TSelf) -> Optional[TSelf]: 40 | # Needed to implement swh.journal.writer.ValueProtocol 41 | return None 42 | 43 | def to_dict(self) -> Dict[str, Any]: 44 | """Wrapper of `attr.asdict` that can be overridden by subclasses 45 | that have special handling of some of the fields.""" 46 | d = attr.asdict(self, recurse=False) 47 | 48 | if d["indexer_configuration_id"] is None: 49 | del d["indexer_configuration_id"] 50 | if d["tool"] is None: 51 | del d["tool"] 52 | 53 | return d 54 | 55 | @classmethod 56 | def from_dict(cls: Type[TSelf], d) -> TSelf: 57 | return cls(**d) 58 | 59 | def unique_key(self) -> Dict: 60 | if not self.tool: 61 | raise ValueError( 62 | f"Cannot compute unique_key of {self.__class__.__name__} with no tool " 63 | f"dictionary (indexer_configuration_id was given instead)" 64 | ) 65 | 66 | tool_dict = { 67 | "tool_name": self.tool["name"], 68 | "tool_version": self.tool["version"], 69 | "tool_configuration": json.dumps( 70 | self.tool["configuration"], sort_keys=True 71 | ), 72 | } 73 | 74 | return { 
75 | **{key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS}, 76 | **tool_dict, 77 | } 78 | 79 | 80 | @attr.s 81 | class ContentMimetypeRow(BaseRow): 82 | object_type: Final = "content_mimetype" 83 | 84 | id = attr.ib(type=Sha1Git) 85 | mimetype = attr.ib(type=str) 86 | encoding = attr.ib(type=str) 87 | 88 | 89 | @attr.s 90 | class ContentLicenseRow(BaseRow): 91 | object_type: Final = "content_fossology_license" 92 | UNIQUE_KEY_FIELDS = ("id", "license") 93 | 94 | id = attr.ib(type=Sha1Git) 95 | license = attr.ib(type=str) 96 | 97 | 98 | @attr.s 99 | class ContentMetadataRow(BaseRow): 100 | object_type: Final = "content_metadata" 101 | 102 | id = attr.ib(type=Sha1Git) 103 | metadata = attr.ib(type=Dict[str, Any]) 104 | 105 | 106 | @attr.s 107 | class DirectoryIntrinsicMetadataRow(BaseRow): 108 | object_type: Final = "directory_intrinsic_metadata" 109 | 110 | id = attr.ib(type=Sha1Git) 111 | metadata = attr.ib(type=Dict[str, Any]) 112 | mappings = attr.ib(type=List[str]) 113 | 114 | 115 | @attr.s 116 | class OriginIntrinsicMetadataRow(BaseRow): 117 | object_type: Final = "origin_intrinsic_metadata" 118 | 119 | id = attr.ib(type=str) 120 | metadata = attr.ib(type=Dict[str, Any]) 121 | from_directory = attr.ib(type=Sha1Git) 122 | mappings = attr.ib(type=List[str]) 123 | 124 | 125 | @attr.s 126 | class OriginExtrinsicMetadataRow(BaseRow): 127 | object_type: Final = "origin_extrinsic_metadata" 128 | 129 | id = attr.ib(type=str) 130 | """origin URL""" 131 | metadata = attr.ib(type=Dict[str, Any]) 132 | from_remd_id = attr.ib(type=Sha1Git) 133 | """id of the RawExtrinsicMetadata object used as source for indexed metadata""" 134 | mappings = attr.ib(type=List[str]) 135 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/10-superuser-init.sql: -------------------------------------------------------------------------------- 1 | create extension if not exists btree_gist; 2 | create extension if not exists pgcrypto; 
3 | 4 | create or replace language plpgsql; 5 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/20-enums.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/swh/indexer/storage/sql/20-enums.sql -------------------------------------------------------------------------------- /swh/indexer/storage/sql/50-data.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoftwareHeritage/swh-indexer/ca2126e5bcd2fcfe06b35edea1f9dd671fd39b19/swh/indexer/storage/sql/50-data.sql -------------------------------------------------------------------------------- /swh/indexer/storage/sql/60-indexes.sql: -------------------------------------------------------------------------------- 1 | -- fossology_license 2 | create unique index fossology_license_pkey on fossology_license(id); 3 | alter table fossology_license add primary key using index fossology_license_pkey; 4 | 5 | create unique index on fossology_license(name); 6 | 7 | -- indexer_configuration 8 | create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); 9 | alter table indexer_configuration add primary key using index indexer_configuration_pkey; 10 | 11 | create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); 12 | 13 | -- content_metadata 14 | create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); 15 | alter table content_metadata add primary key using index content_metadata_pkey; 16 | 17 | alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 18 | alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; 
19 | 20 | -- directory_intrinsic_metadata 21 | create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); 22 | alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; 23 | 24 | alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 25 | alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; 26 | 27 | -- content_mimetype 28 | create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); 29 | alter table content_mimetype add primary key using index content_mimetype_pkey; 30 | 31 | alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 32 | alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; 33 | 34 | create index on content_mimetype(id) where mimetype like 'text/%'; 35 | 36 | -- content_fossology_license 37 | create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); 38 | alter table content_fossology_license add primary key using index content_fossology_license_pkey; 39 | 40 | alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; 41 | alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; 42 | 43 | alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 44 | alter table content_fossology_license validate 
constraint content_fossology_license_indexer_configuration_id_fkey; 45 | 46 | -- origin_intrinsic_metadata 47 | create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); 48 | alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; 49 | 50 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 51 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 52 | 53 | create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); 54 | create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); 55 | 56 | -- origin_extrinsic_metadata 57 | create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id); 58 | alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey; 59 | 60 | alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 61 | alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey; 62 | 63 | create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector); 64 | create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings); 65 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/115.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 114 3 | -- to_version: 115 4 | -- 
description: Remove temporary table use in reading api 5 | 6 | insert into dbversion(version, release, description) 7 | values(115, now(), 'Work In Progress'); 8 | 9 | drop function swh_mktemp_content_mimetype_missing(); 10 | drop function swh_content_mimetype_missing(); 11 | 12 | drop function swh_content_mimetype_get(); 13 | drop type content_mimetype_signature; 14 | 15 | drop function swh_mktemp_content_language_missing(); 16 | drop function swh_content_language_missing(); 17 | 18 | drop function swh_content_language_get(); 19 | drop type content_language_signature; 20 | 21 | drop function swh_mktemp_content_ctags_missing(); 22 | drop function swh_content_ctags_missing(); 23 | 24 | drop function swh_content_ctags_get(); 25 | --drop type content_ctags_signature; -- still used in swh_content_ctags_search 26 | 27 | drop function swh_content_fossology_license_get(); 28 | drop type content_fossology_license_signature; 29 | 30 | drop function swh_mktemp_content_metadata_missing(); 31 | drop function swh_content_metadata_missing(); 32 | 33 | drop function swh_content_metadata_get(); 34 | drop type content_metadata_signature; 35 | 36 | drop function swh_mktemp_revision_metadata_missing(); 37 | drop function swh_revision_metadata_missing(); 38 | 39 | drop function swh_revision_metadata_get(); 40 | drop type revision_metadata_signature; 41 | 42 | drop function swh_mktemp_bytea(); 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/116.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 115 3 | -- to_version: 116 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(116, now(), 'Work In Progress'); 8 | 9 | drop table origin_metadata_translation; 10 | 11 | create table origin_intrinsic_metadata( 12 | origin_id bigserial not null, 13 | metadata jsonb, 14 | indexer_configuration_id 
bigint not null, 15 | from_revision sha1_git not null 16 | ); 17 | 18 | comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; 19 | comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; 20 | comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; 21 | comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; 22 | comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; 23 | 24 | -- create a temporary table for retrieving origin_intrinsic_metadata 25 | create or replace function swh_mktemp_origin_intrinsic_metadata() 26 | returns void 27 | language sql 28 | as $$ 29 | create temporary table tmp_origin_intrinsic_metadata ( 30 | like origin_intrinsic_metadata including defaults 31 | ) on commit drop; 32 | $$; 33 | 34 | comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; 35 | 36 | 37 | -- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, 38 | -- overwriting duplicates if conflict_update is true, skipping duplicates 39 | -- otherwise. 40 | -- 41 | -- If filtering duplicates is in order, the call to 42 | -- swh_origin_intrinsic_metadata_missing must take place before calling this 43 | -- function. 44 | -- 45 | -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to 46 | -- tmp_origin_intrinsic_metadata, 2. 
call this function 47 | create or replace function swh_origin_intrinsic_metadata_add( 48 | conflict_update boolean) 49 | returns void 50 | language plpgsql 51 | as $$ 52 | begin 53 | if conflict_update then 54 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) 55 | select origin_id, metadata, indexer_configuration_id, from_revision 56 | from tmp_origin_intrinsic_metadata 57 | on conflict(origin_id, indexer_configuration_id) 58 | do update set metadata = excluded.metadata; 59 | 60 | else 61 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) 62 | select origin_id, metadata, indexer_configuration_id, from_revision 63 | from tmp_origin_intrinsic_metadata 64 | on conflict(origin_id, indexer_configuration_id) 65 | do nothing; 66 | end if; 67 | return; 68 | end 69 | $$; 70 | 71 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 72 | 73 | 74 | -- origin_intrinsic_metadata 75 | create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); 76 | alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; 77 | 78 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 79 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 80 | alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; 81 | alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; 82 | 
-------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/117.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 116 3 | -- to_version: 117 4 | -- description: Add fulltext search index for origin intrinsic metadata 5 | 6 | insert into dbversion(version, release, description) 7 | values(117, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata add column metadata_tsvector tsvector; 10 | update origin_intrinsic_metadata set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 11 | create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); 12 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/118.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 117 3 | -- to_version: 118 4 | -- description: content_mimetype: Migrate bytes column to text 5 | 6 | insert into dbversion(version, release, description) 7 | values(118, now(), 'Work In Progress'); 8 | 9 | alter table content_mimetype 10 | alter column mimetype set data type text 11 | using convert_from(mimetype, 'utf-8'), 12 | alter column encoding set data type text 13 | using convert_from(encoding, 'utf-8'); 14 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/119.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 118 3 | -- to_version: 119 4 | -- description: metadata tables: add 'mappings' column 5 | 6 | insert into dbversion(version, release, description) 7 | values(119, now(), 'Work In Progress'); 8 | 9 | alter table revision_metadata 10 | add column mappings text array not 
null default '{}'; 11 | alter table revision_metadata 12 | alter column mappings 13 | drop default; 14 | 15 | alter table origin_intrinsic_metadata 16 | add column mappings text array not null default '{}'; 17 | alter table origin_intrinsic_metadata 18 | alter column mappings 19 | drop default; 20 | 21 | 22 | create or replace function swh_revision_metadata_add(conflict_update boolean) 23 | returns void 24 | language plpgsql 25 | as $$ 26 | begin 27 | if conflict_update then 28 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 29 | select id, translated_metadata, mappings, indexer_configuration_id 30 | from tmp_revision_metadata tcm 31 | on conflict(id, indexer_configuration_id) 32 | do update set translated_metadata = excluded.translated_metadata; 33 | 34 | else 35 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 36 | select id, translated_metadata, mappings, indexer_configuration_id 37 | from tmp_revision_metadata tcm 38 | on conflict(id, indexer_configuration_id) 39 | do nothing; 40 | end if; 41 | return; 42 | end 43 | $$; 44 | 45 | 46 | create or replace function swh_origin_intrinsic_metadata_add( 47 | conflict_update boolean) 48 | returns void 49 | language plpgsql 50 | as $$ 51 | begin 52 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 53 | if conflict_update then 54 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 55 | select origin_id, metadata, indexer_configuration_id, from_revision, 56 | metadata_tsvector, mappings 57 | from tmp_origin_intrinsic_metadata 58 | on conflict(origin_id, indexer_configuration_id) 59 | do update set metadata = excluded.metadata; 60 | 61 | else 62 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 63 | select origin_id, metadata, indexer_configuration_id, 
from_revision, 64 | metadata_tsvector, mappings 65 | from tmp_origin_intrinsic_metadata 66 | on conflict(origin_id, indexer_configuration_id) 67 | do nothing; 68 | end if; 69 | return; 70 | end 71 | $$; 72 | 73 | 74 | -- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata. 75 | -- 76 | -- It uses the "pg_catalog.simple" dictionary, as it has no stopword, 77 | -- so it should be suitable for proper names and non-English text. 78 | create or replace function swh_origin_intrinsic_metadata_compute_tsvector() 79 | returns void 80 | language plpgsql 81 | as $$ 82 | begin 83 | update tmp_origin_intrinsic_metadata 84 | set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 85 | end 86 | $$; 87 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/120.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 119 3 | -- to_version: 120 4 | -- description: fix updates of the 'mappings' column in metadata tables 5 | 6 | insert into dbversion(version, release, description) 7 | values(120, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_revision_metadata_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | if conflict_update then 15 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 16 | select id, translated_metadata, mappings, indexer_configuration_id 17 | from tmp_revision_metadata tcm 18 | on conflict(id, indexer_configuration_id) 19 | do update set 20 | translated_metadata = excluded.translated_metadata, 21 | mappings = excluded.mappings; 22 | 23 | else 24 | insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) 25 | select id, translated_metadata, mappings, indexer_configuration_id 26 | from tmp_revision_metadata tcm 27 | on conflict(id, 
indexer_configuration_id) 28 | do nothing; 29 | end if; 30 | return; 31 | end 32 | $$; 33 | 34 | create or replace function swh_origin_intrinsic_metadata_add( 35 | conflict_update boolean) 36 | returns void 37 | language plpgsql 38 | as $$ 39 | begin 40 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 41 | if conflict_update then 42 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 43 | select origin_id, metadata, indexer_configuration_id, from_revision, 44 | metadata_tsvector, mappings 45 | from tmp_origin_intrinsic_metadata 46 | on conflict(origin_id, indexer_configuration_id) 47 | do update set 48 | metadata = excluded.metadata, 49 | mappings = excluded.mappings; 50 | 51 | else 52 | insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 53 | select origin_id, metadata, indexer_configuration_id, from_revision, 54 | metadata_tsvector, mappings 55 | from tmp_origin_intrinsic_metadata 56 | on conflict(origin_id, indexer_configuration_id) 57 | do nothing; 58 | end if; 59 | return; 60 | end 61 | $$; 62 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/121.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 120 3 | -- to_version: 121 4 | -- description: add comment to the 'mappings' column 5 | 6 | insert into dbversion(version, release, description) 7 | values(121, now(), 'Work In Progress'); 8 | 9 | comment on column revision_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; 10 | comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; 11 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/122.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 121 3 | -- to_version: 122 4 | -- description: add index to search origin_intrinsic_metadata for mappings. 5 | 6 | insert into dbversion(version, release, description) 7 | values(122, now(), 'Work In Progress'); 8 | 9 | create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/123.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 122 3 | -- to_version: 123 4 | -- description: fix heterogeneity of names in metadata tables 5 | 6 | insert into dbversion(version, release, description) 7 | values(123, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_metadata_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | if conflict_update then 15 | insert into content_metadata (id, metadata, indexer_configuration_id) 16 | select id, metadata, indexer_configuration_id 17 | from tmp_content_metadata tcm 18 | on conflict(id, indexer_configuration_id) 19 | do update set metadata = excluded.metadata; 20 | 21 | else 22 | insert into content_metadata (id, metadata, indexer_configuration_id) 23 | select id, metadata, indexer_configuration_id 24 | from tmp_content_metadata tcm 25 | on conflict(id, indexer_configuration_id) 26 | do nothing; 27 | end if; 28 | return; 29 | end 30 | $$; 31 | 32 | alter function swh_revision_metadata_add rename to swh_revision_intrinsic_metadata_add; 33 | create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) 34 | returns void 
35 | language plpgsql 36 | as $$ 37 | begin 38 | if conflict_update then 39 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 40 | select id, metadata, mappings, indexer_configuration_id 41 | from tmp_revision_intrinsic_metadata tcm 42 | on conflict(id, indexer_configuration_id) 43 | do update set 44 | metadata = excluded.metadata, 45 | mappings = excluded.mappings; 46 | 47 | else 48 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 49 | select id, metadata, mappings, indexer_configuration_id 50 | from tmp_revision_intrinsic_metadata tcm 51 | on conflict(id, indexer_configuration_id) 52 | do nothing; 53 | end if; 54 | return; 55 | end 56 | $$; 57 | 58 | alter function swh_mktemp_revision_metadata rename to swh_mktemp_revision_intrinsic_metadata; 59 | create or replace function swh_mktemp_revision_intrinsic_metadata() 60 | returns void 61 | language sql 62 | as $$ 63 | create temporary table tmp_revision_intrinsic_metadata ( 64 | like revision_intrinsic_metadata including defaults 65 | ) on commit drop; 66 | $$; 67 | 68 | create or replace function swh_origin_intrinsic_metadata_add( 69 | conflict_update boolean) 70 | returns void 71 | language plpgsql 72 | as $$ 73 | begin 74 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 75 | if conflict_update then 76 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 77 | select id, metadata, indexer_configuration_id, from_revision, 78 | metadata_tsvector, mappings 79 | from tmp_origin_intrinsic_metadata 80 | on conflict(id, indexer_configuration_id) 81 | do update set 82 | metadata = excluded.metadata, 83 | mappings = excluded.mappings; 84 | 85 | else 86 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 87 | select id, metadata, indexer_configuration_id, from_revision, 88 | 
metadata_tsvector, mappings 89 | from tmp_origin_intrinsic_metadata 90 | on conflict(id, indexer_configuration_id) 91 | do nothing; 92 | end if; 93 | return; 94 | end 95 | $$; 96 | 97 | alter index revision_metadata_pkey rename to revision_intrinsic_metadata_pkey; 98 | 99 | alter table revision_metadata rename column translated_metadata to metadata; 100 | alter table content_metadata rename column translated_metadata to metadata; 101 | alter table origin_intrinsic_metadata rename column origin_id to id; 102 | 103 | alter table revision_metadata rename to revision_intrinsic_metadata; 104 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/124.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 123 3 | -- to_version: 124 4 | -- description: drop constraint that origin_intrinsic_metadata references an existing revision_intrinsic_metadata. 5 | 6 | insert into dbversion(version, release, description) 7 | values(124, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata drop constraint origin_intrinsic_metadata_revision_metadata_fkey; 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/125.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 124 3 | -- to_version: 125 4 | -- description: Add 'origin_url' column to origin_intrinsic_metadata. 
5 | 6 | insert into dbversion(version, release, description) 7 | values(125, now(), 'Work In Progress'); 8 | 9 | alter table origin_intrinsic_metadata 10 | add column origin_url text; 11 | 12 | create or replace function swh_origin_intrinsic_metadata_add( 13 | conflict_update boolean) 14 | returns void 15 | language plpgsql 16 | as $$ 17 | begin 18 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 19 | if conflict_update then 20 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 21 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 22 | metadata_tsvector, mappings 23 | from tmp_origin_intrinsic_metadata 24 | on conflict(id, indexer_configuration_id) 25 | do update set 26 | metadata = excluded.metadata, 27 | mappings = excluded.mappings; 28 | 29 | else 30 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 31 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 32 | metadata_tsvector, mappings 33 | from tmp_origin_intrinsic_metadata 34 | on conflict(id, indexer_configuration_id) 35 | do nothing; 36 | end if; 37 | return; 38 | end 39 | $$; 40 | 41 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 42 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/126.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 125 3 | -- to_version: 126 4 | -- description: Make swh_origin_intrinsic_metadata_add update all fields 5 | 6 | insert into dbversion(version, release, description) 7 | values(126, now(), 'Work In Progress'); 8 | 9 | 10 | create or replace function swh_origin_intrinsic_metadata_add( 11 | conflict_update boolean) 12 | returns void 13 
| language plpgsql 14 | as $$ 15 | begin 16 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 17 | if conflict_update then 18 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 19 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 20 | metadata_tsvector, mappings 21 | from tmp_origin_intrinsic_metadata 22 | on conflict(id, indexer_configuration_id) 23 | do update set 24 | metadata = excluded.metadata, 25 | metadata_tsvector = excluded.metadata_tsvector, 26 | mappings = excluded.mappings, 27 | origin_url = excluded.origin_url, 28 | from_revision = excluded.from_revision; 29 | 30 | else 31 | insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 32 | select id, origin_url, metadata, indexer_configuration_id, from_revision, 33 | metadata_tsvector, mappings 34 | from tmp_origin_intrinsic_metadata 35 | on conflict(id, indexer_configuration_id) 36 | do nothing; 37 | end if; 38 | return; 39 | end 40 | $$; 41 | 42 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/127.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 126 3 | -- to_version: 127 4 | -- description: Remove swh_origin_intrinsic_metadata_add origin_url field and 5 | -- replace id by the former content of origin_url 6 | 7 | insert into dbversion(version, release, description) 8 | values(127, now(), 'Work In Progress'); 9 | 10 | -- replace id column by origin_url 11 | alter table origin_intrinsic_metadata 12 | drop constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 13 | alter table origin_intrinsic_metadata 14 | drop constraint 
origin_intrinsic_metadata_pkey; 15 | alter table origin_intrinsic_metadata 16 | drop column id; 17 | alter table origin_intrinsic_metadata 18 | rename column origin_url to id; 19 | comment on column origin_intrinsic_metadata.id is 'url of the origin'; 20 | 21 | -- replace functions that operate on this table 22 | create or replace function swh_origin_intrinsic_metadata_add( 23 | conflict_update boolean) 24 | returns void 25 | language plpgsql 26 | as $$ 27 | begin 28 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 29 | if conflict_update then 30 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 31 | select id, metadata, indexer_configuration_id, from_revision, 32 | metadata_tsvector, mappings 33 | from tmp_origin_intrinsic_metadata 34 | on conflict(id, indexer_configuration_id) 35 | do update set 36 | metadata = excluded.metadata, 37 | metadata_tsvector = excluded.metadata_tsvector, 38 | mappings = excluded.mappings, 39 | from_revision = excluded.from_revision; 40 | 41 | else 42 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 43 | select id, metadata, indexer_configuration_id, from_revision, 44 | metadata_tsvector, mappings 45 | from tmp_origin_intrinsic_metadata 46 | on conflict(id, indexer_configuration_id) 47 | do nothing; 48 | end if; 49 | return; 50 | end 51 | $$; 52 | comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; 53 | 54 | -- recreate indexes/constraints on this table 55 | create unique index origin_intrinsic_metadata_pkey 56 | on origin_intrinsic_metadata(id, indexer_configuration_id); 57 | alter table origin_intrinsic_metadata 58 | add primary key using index origin_intrinsic_metadata_pkey; 59 | 60 | alter table origin_intrinsic_metadata 61 | add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key 
(indexer_configuration_id) references indexer_configuration(id) not valid; 62 | alter table origin_intrinsic_metadata 63 | validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; 64 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/128.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 127 3 | -- to_version: 128 4 | -- description: Add index on content_mimetype table to improve read queries 5 | 6 | insert into dbversion(version, release, description) 7 | values(128, now(), 'Work In Progress'); 8 | 9 | create index on content_mimetype(id) where mimetype like 'text/%'; 10 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/129.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 128 3 | -- to_version: 129 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(129, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_mktemp(tblname regclass) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | execute format(' 15 | create temporary table if not exists tmp_%1$I 16 | (like %1$I including defaults) 17 | on commit delete rows; 18 | alter table tmp_%1$I drop column if exists object_id; 19 | ', tblname); 20 | return; 21 | end 22 | $$; 23 | 24 | -- create a temporary table for content_mimetype tmp_content_mimetype, 25 | create or replace function swh_mktemp_content_mimetype() 26 | returns void 27 | language sql 28 | as $$ 29 | create temporary table if not exists tmp_content_mimetype ( 30 | like content_mimetype including defaults 31 | ) on commit delete rows; 32 | $$; 33 | 34 | -- create a temporary table for retrieving content_language 35 | create or replace function 
swh_mktemp_content_language() 36 | returns void 37 | language sql 38 | as $$ 39 | create temporary table if not exists tmp_content_language ( 40 | like content_language including defaults 41 | ) on commit delete rows; 42 | $$; 43 | 44 | comment on function swh_mktemp_content_language() is 'Helper table to add content language'; 45 | 46 | 47 | -- create a temporary table for content_ctags tmp_content_ctags, 48 | create or replace function swh_mktemp_content_ctags() 49 | returns void 50 | language sql 51 | as $$ 52 | create temporary table if not exists tmp_content_ctags ( 53 | like content_ctags including defaults 54 | ) on commit delete rows; 55 | $$; 56 | 57 | comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; 58 | 59 | -- create a temporary table for content_fossology_license tmp_content_fossology_license, 60 | create or replace function swh_mktemp_content_fossology_license() 61 | returns void 62 | language sql 63 | as $$ 64 | create temporary table if not exists tmp_content_fossology_license ( 65 | id sha1, 66 | license text, 67 | indexer_configuration_id integer 68 | ) on commit delete rows; 69 | $$; 70 | 71 | comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; 72 | 73 | 74 | -- create a temporary table for retrieving content_metadata 75 | create or replace function swh_mktemp_content_metadata() 76 | returns void 77 | language sql 78 | as $$ 79 | create temporary table if not exists tmp_content_metadata ( 80 | like content_metadata including defaults 81 | ) on commit delete rows; 82 | $$; 83 | 84 | comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; 85 | 86 | 87 | -- create a temporary table for retrieving revision_intrinsic_metadata 88 | create or replace function swh_mktemp_revision_intrinsic_metadata() 89 | returns void 90 | language sql 91 | as $$ 92 | create temporary table if not exists tmp_revision_intrinsic_metadata ( 93 | like 
revision_intrinsic_metadata including defaults 94 | ) on commit delete rows; 95 | $$; 96 | 97 | comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; 98 | 99 | -- create a temporary table for retrieving origin_intrinsic_metadata 100 | create or replace function swh_mktemp_origin_intrinsic_metadata() 101 | returns void 102 | language sql 103 | as $$ 104 | create temporary table if not exists tmp_origin_intrinsic_metadata ( 105 | like origin_intrinsic_metadata including defaults 106 | ) on commit delete rows; 107 | $$; 108 | 109 | comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; 110 | 111 | create or replace function swh_mktemp_indexer_configuration() 112 | returns void 113 | language sql 114 | as $$ 115 | create temporary table if not exists tmp_indexer_configuration ( 116 | like indexer_configuration including defaults 117 | ) on commit delete rows; 118 | alter table tmp_indexer_configuration drop column if exists id; 119 | $$; 120 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/130.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 129 3 | -- to_version: 130 4 | -- description: 5 | 6 | insert into dbversion(version, release, description) 7 | values(130, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_fossology_license_add(conflict_update boolean) 10 | returns void 11 | language plpgsql 12 | as $$ 13 | begin 14 | -- insert unknown licenses first 15 | insert into fossology_license (name) 16 | select distinct license from tmp_content_fossology_license tmp 17 | where not exists (select 1 from fossology_license where name=tmp.license) 18 | on conflict(name) do nothing; 19 | 20 | if conflict_update then 21 | insert into content_fossology_license (id, license_id, 
indexer_configuration_id) 22 | select tcl.id, 23 | (select id from fossology_license where name = tcl.license) as license, 24 | indexer_configuration_id 25 | from tmp_content_fossology_license tcl 26 | on conflict(id, license_id, indexer_configuration_id) 27 | do update set license_id = excluded.license_id; 28 | return; 29 | end if; 30 | 31 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 32 | select tcl.id, 33 | (select id from fossology_license where name = tcl.license) as license, 34 | indexer_configuration_id 35 | from tmp_content_fossology_license tcl 36 | on conflict(id, license_id, indexer_configuration_id) 37 | do nothing; 38 | return; 39 | end 40 | $$; 41 | 42 | comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; 43 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/132.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 131 3 | -- to_version: 132 4 | -- description: _add function returns the inserted rows 5 | 6 | insert into dbversion(version, release, description) 7 | values(132, now(), 'Work In Progress'); 8 | 9 | create or replace function swh_content_fossology_license_add(conflict_update boolean) 10 | returns bigint 11 | language plpgsql 12 | as $$ 13 | declare 14 | res bigint; 15 | begin 16 | -- insert unknown licenses first 17 | insert into fossology_license (name) 18 | select distinct license from tmp_content_fossology_license tmp 19 | where not exists (select 1 from fossology_license where name=tmp.license) 20 | on conflict(name) do nothing; 21 | 22 | if conflict_update then 23 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 24 | select tcl.id, 25 | (select id from fossology_license where name = tcl.license) as license, 26 | indexer_configuration_id 27 | from 
tmp_content_fossology_license tcl 28 | on conflict(id, license_id, indexer_configuration_id) 29 | do update set license_id = excluded.license_id; 30 | else 31 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 32 | select tcl.id, 33 | (select id from fossology_license where name = tcl.license) as license, 34 | indexer_configuration_id 35 | from tmp_content_fossology_license tcl 36 | on conflict(id, license_id, indexer_configuration_id) 37 | do nothing; 38 | end if; 39 | 40 | get diagnostics res = ROW_COUNT; 41 | return res; 42 | end 43 | $$; 44 | 45 | comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; 46 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/133.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 132 3 | -- to_version: 133 4 | -- description: remove 'conflict_update' argument 5 | 6 | insert into dbversion(version, release, description) 7 | values(133, now(), 'Work In Progress'); 8 | 9 | drop function swh_content_mimetype_add(conflict_update boolean); 10 | create or replace function swh_content_mimetype_add() 11 | returns bigint 12 | language plpgsql 13 | as $$ 14 | declare 15 | res bigint; 16 | begin 17 | insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) 18 | select id, mimetype, encoding, indexer_configuration_id 19 | from tmp_content_mimetype tcm 20 | on conflict(id, indexer_configuration_id) 21 | do update set mimetype = excluded.mimetype, 22 | encoding = excluded.encoding; 23 | 24 | get diagnostics res = ROW_COUNT; 25 | return res; 26 | end 27 | $$; 28 | 29 | comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; 30 | 31 | 32 | 33 | drop function swh_content_language_add(conflict_update boolean); 34 | create or replace function swh_content_language_add() 35 | returns 
bigint 36 | language plpgsql 37 | as $$ 38 | declare 39 | res bigint; 40 | begin 41 | insert into content_language (id, lang, indexer_configuration_id) 42 | select id, lang, indexer_configuration_id 43 | from tmp_content_language tcl 44 | on conflict(id, indexer_configuration_id) 45 | do update set lang = excluded.lang; 46 | 47 | get diagnostics res = ROW_COUNT; 48 | return res; 49 | end 50 | $$; 51 | 52 | comment on function swh_content_language_add() IS 'Add new content languages'; 53 | 54 | 55 | 56 | drop function swh_content_ctags_add(conflict_update boolean); 57 | create or replace function swh_content_ctags_add() 58 | returns bigint 59 | language plpgsql 60 | as $$ 61 | declare 62 | res bigint; 63 | begin 64 | insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) 65 | select id, name, kind, line, lang, indexer_configuration_id 66 | from tmp_content_ctags tct 67 | on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) 68 | do nothing; 69 | 70 | get diagnostics res = ROW_COUNT; 71 | return res; 72 | end 73 | $$; 74 | 75 | comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; 76 | 77 | 78 | 79 | drop function swh_content_fossology_license_add(conflict_update boolean); 80 | create or replace function swh_content_fossology_license_add() 81 | returns bigint 82 | language plpgsql 83 | as $$ 84 | declare 85 | res bigint; 86 | begin 87 | -- insert unknown licenses first 88 | insert into fossology_license (name) 89 | select distinct license from tmp_content_fossology_license tmp 90 | where not exists (select 1 from fossology_license where name=tmp.license) 91 | on conflict(name) do nothing; 92 | 93 | insert into content_fossology_license (id, license_id, indexer_configuration_id) 94 | select tcl.id, 95 | (select id from fossology_license where name = tcl.license) as license, 96 | indexer_configuration_id 97 | from tmp_content_fossology_license tcl 98 | on conflict(id, license_id, 
indexer_configuration_id) 99 | do update set license_id = excluded.license_id; 100 | 101 | get diagnostics res = ROW_COUNT; 102 | return res; 103 | end 104 | $$; 105 | 106 | comment on function swh_content_fossology_license_add() IS 'Add new content licenses'; 107 | 108 | 109 | 110 | drop function swh_content_metadata_add(conflict_update boolean); 111 | create or replace function swh_content_metadata_add() 112 | returns bigint 113 | language plpgsql 114 | as $$ 115 | declare 116 | res bigint; 117 | begin 118 | insert into content_metadata (id, metadata, indexer_configuration_id) 119 | select id, metadata, indexer_configuration_id 120 | from tmp_content_metadata tcm 121 | on conflict(id, indexer_configuration_id) 122 | do update set metadata = excluded.metadata; 123 | 124 | get diagnostics res = ROW_COUNT; 125 | return res; 126 | end 127 | $$; 128 | 129 | comment on function swh_content_metadata_add() IS 'Add new content metadata'; 130 | 131 | 132 | 133 | drop function swh_revision_intrinsic_metadata_add(conflict_update boolean); 134 | create or replace function swh_revision_intrinsic_metadata_add() 135 | returns bigint 136 | language plpgsql 137 | as $$ 138 | declare 139 | res bigint; 140 | begin 141 | insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) 142 | select id, metadata, mappings, indexer_configuration_id 143 | from tmp_revision_intrinsic_metadata tcm 144 | on conflict(id, indexer_configuration_id) 145 | do update set 146 | metadata = excluded.metadata, 147 | mappings = excluded.mappings; 148 | 149 | get diagnostics res = ROW_COUNT; 150 | return res; 151 | end 152 | $$; 153 | 154 | comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; 155 | 156 | 157 | 158 | drop function swh_origin_intrinsic_metadata_add(conflict_update boolean); 159 | create or replace function swh_origin_intrinsic_metadata_add() 160 | returns bigint 161 | language plpgsql 162 | as $$ 163 | declare 164 | 
res bigint; 165 | begin 166 | perform swh_origin_intrinsic_metadata_compute_tsvector(); 167 | 168 | insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) 169 | select id, metadata, indexer_configuration_id, from_revision, 170 | metadata_tsvector, mappings 171 | from tmp_origin_intrinsic_metadata 172 | on conflict(id, indexer_configuration_id) 173 | do update set 174 | metadata = excluded.metadata, 175 | metadata_tsvector = excluded.metadata_tsvector, 176 | mappings = excluded.mappings, 177 | from_revision = excluded.from_revision; 178 | 179 | get diagnostics res = ROW_COUNT; 180 | return res; 181 | end 182 | $$; 183 | 184 | comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; 185 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/135.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 134 3 | -- to_version: 135 4 | -- description: Add support for origin_extrinsic_metadata 5 | 6 | insert into dbversion(version, release, description) 7 | values(135, now(), 'Work In Progress'); 8 | 9 | create table origin_extrinsic_metadata( 10 | id text not null, -- origin url 11 | metadata jsonb, 12 | indexer_configuration_id bigint not null, 13 | from_remd_id sha1_git not null, 14 | metadata_tsvector tsvector, 15 | mappings text array not null 16 | ); 17 | 18 | comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin'; 19 | comment on column origin_extrinsic_metadata.id is 'url of the origin'; 20 | comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a directory'; 21 | comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; 22 | comment on column origin_extrinsic_metadata.from_remd_id is 'sha1 of the 
directory this metadata was copied from.'; 23 | comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)'; 24 | 25 | -- create a temporary table for retrieving origin_extrinsic_metadata 26 | create or replace function swh_mktemp_origin_extrinsic_metadata() 27 | returns void 28 | language sql 29 | as $$ 30 | create temporary table if not exists tmp_origin_extrinsic_metadata ( 31 | like origin_extrinsic_metadata including defaults 32 | ) on commit delete rows; 33 | $$; 34 | 35 | comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata'; 36 | 37 | create or replace function swh_mktemp_indexer_configuration() 38 | returns void 39 | language sql 40 | as $$ 41 | create temporary table if not exists tmp_indexer_configuration ( 42 | like indexer_configuration including defaults 43 | ) on commit delete rows; 44 | alter table tmp_indexer_configuration drop column if exists id; 45 | $$; 46 | 47 | -- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata, 48 | -- overwriting duplicates. 49 | -- 50 | -- If filtering duplicates is in order, the call to 51 | -- swh_origin_extrinsic_metadata_missing must take place before calling this 52 | -- function. 53 | -- 54 | -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to 55 | -- tmp_origin_extrinsic_metadata, 2. 
call this function 56 | create or replace function swh_origin_extrinsic_metadata_add() 57 | returns bigint 58 | language plpgsql 59 | as $$ 60 | declare 61 | res bigint; 62 | begin 63 | perform swh_origin_extrinsic_metadata_compute_tsvector(); 64 | 65 | insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings) 66 | select id, metadata, indexer_configuration_id, from_remd_id, 67 | metadata_tsvector, mappings 68 | from tmp_origin_extrinsic_metadata 69 | on conflict(id, indexer_configuration_id) 70 | do update set 71 | metadata = excluded.metadata, 72 | metadata_tsvector = excluded.metadata_tsvector, 73 | mappings = excluded.mappings, 74 | from_remd_id = excluded.from_remd_id; 75 | 76 | get diagnostics res = ROW_COUNT; 77 | return res; 78 | end 79 | $$; 80 | 81 | comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata'; 82 | 83 | 84 | -- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata. 85 | -- 86 | -- It uses the "pg_catalog.simple" dictionary, as it has no stopword, 87 | -- so it should be suitable for proper names and non-English text. 
88 | create or replace function swh_origin_extrinsic_metadata_compute_tsvector() 89 | returns void 90 | language plpgsql 91 | as $$ 92 | begin 93 | update tmp_origin_extrinsic_metadata 94 | set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); 95 | end 96 | $$; 97 | 98 | -- origin_extrinsic_metadata 99 | create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id); 100 | alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey; 101 | 102 | alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; 103 | alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey; 104 | 105 | create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector); 106 | create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings); 107 | -------------------------------------------------------------------------------- /swh/indexer/storage/sql/upgrades/137.sql: -------------------------------------------------------------------------------- 1 | -- SWH Indexer DB schema upgrade 2 | -- from_version: 136 3 | -- to_version: 137 4 | -- description: Drop content_language and content_ctags tables and related functions 5 | 6 | drop function if exists swh_content_language_add; 7 | drop function if exists swh_mktemp_content_language(); 8 | drop function if exists swh_mktemp_content_ctags(); 9 | drop function if exists swh_content_ctags_add(); 10 | drop function if exists swh_content_ctags_search; 11 | 12 | drop type if exists content_ctags_signature; 13 | 14 | drop table if exists content_language; 15 | drop table if exists content_ctags; 16 | 17 | drop type if exists languages; 18 | drop type if exists ctags_languages; 19 | 20 
| -------------------------------------------------------------------------------- /swh/indexer/storage/writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict, Iterable, Optional 7 | 8 | try: 9 | from swh.journal.writer import JournalWriterInterface, get_journal_writer 10 | except ImportError: 11 | get_journal_writer = None # type: ignore 12 | # mypy limitation, see https://github.com/python/mypy/issues/1153 13 | 14 | from .model import BaseRow 15 | 16 | 17 | class JournalWriter: 18 | """Journal writer storage collaborator. It's in charge of adding objects to 19 | the journal. 20 | 21 | """ 22 | 23 | journal: Optional[JournalWriterInterface] 24 | 25 | def __init__(self, journal_writer: Dict[str, Any]): 26 | """ 27 | Args: 28 | journal_writer: configuration passed to 29 | `swh.journal.writer.get_journal_writer` 30 | """ 31 | if journal_writer: 32 | if get_journal_writer is None: 33 | raise EnvironmentError( 34 | "You need the swh.journal package to use the " 35 | "journal_writer feature" 36 | ) 37 | self.journal = get_journal_writer( 38 | **journal_writer, 39 | value_sanitizer=lambda object_type, value_dict: value_dict, 40 | ) 41 | else: 42 | self.journal = None 43 | 44 | def write_additions(self, obj_type, entries: Iterable[BaseRow]) -> None: 45 | if not self.journal: 46 | return 47 | 48 | translated = [] 49 | 50 | for entry in entries: 51 | assert entry.object_type == obj_type # type: ignore 52 | 53 | # ids are internal to the database and should not be sent to postgresql 54 | if entry.indexer_configuration_id is not None: 55 | raise ValueError( 56 | f"{entry} passed to JournalWriter.write_additions has " 57 | 
f"indexer_configuration_id instead of full tool dict" 58 | ) 59 | assert entry.tool, "Missing both indexer_configuration_id and tool dict" 60 | if "id" in entry.tool: 61 | raise ValueError( 62 | f"{entry} passed to JournalWriter.write_additions " 63 | f"contains a tool id" 64 | ) 65 | 66 | translated.append(entry) 67 | 68 | # write to kafka 69 | self.journal.write_additions(obj_type, translated) 70 | -------------------------------------------------------------------------------- /swh/indexer/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | import swh.indexer 4 | 5 | __all__ = ["start_worker_thread"] 6 | 7 | SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql") 8 | -------------------------------------------------------------------------------- /swh/indexer/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from functools import partial 7 | import os 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | from pytest_postgresql import factories 12 | import yaml 13 | 14 | from swh.core.db.db_utils import initialize_database_for_module 15 | from swh.indexer.storage import IndexerStorage, get_indexer_storage 16 | from swh.objstorage.factory import get_objstorage 17 | from swh.storage import get_storage 18 | 19 | from .utils import fill_obj_storage, fill_storage 20 | 21 | idx_postgresql_proc = factories.postgresql_proc( 22 | load=[ 23 | partial( 24 | initialize_database_for_module, 25 | modname="indexer.storage", 26 | version=IndexerStorage.current_version, 27 | ) 28 | ], 29 | ) 30 | 31 | idx_storage_postgresql = 
idx_storage_postgresql = factories.postgresql("idx_postgresql_proc")


@pytest.fixture
def idx_storage_backend_config(idx_storage_postgresql):
    """Plain PostgreSQL indexer-storage configuration, without a journal
    collaborator (so clients of this fixture do not pull in the optional
    swh.journal dependency)."""
    return {
        "cls": "postgresql",
        "db": idx_storage_postgresql.info.dsn,
    }


@pytest.fixture
def swh_indexer_config(
    swh_storage_backend_config,
    idx_storage_backend_config,
):
    """Full indexer configuration dict, wired to the test backends."""
    config = {
        "storage": swh_storage_backend_config,
        "objstorage": {"cls": "memory"},
        "indexer_storage": idx_storage_backend_config,
        "tools": {
            "name": "file",
            "version": "1:5.30-1+deb9u1",
            "configuration": {"type": "library", "debian-package": "python3-magic"},
        },
        # for the rehash indexer
        "compute_checksums": ["blake2b512"],
    }
    return config


@pytest.fixture
def idx_storage(swh_indexer_config):
    """Indexer storage instance that gets injected into all indexer classes."""
    return get_indexer_storage(**swh_indexer_config["indexer_storage"])


@pytest.fixture
def storage(swh_indexer_config):
    """Pre-filled storage instance that gets injected into all indexer
    classes."""
    store = get_storage(**swh_indexer_config["storage"])
    fill_storage(store)
    return store


@pytest.fixture
def obj_storage(swh_indexer_config):
    """Pre-filled in-memory objstorage, patched into swh.indexer.indexer for
    the duration of the test."""
    objstorage = get_objstorage(**swh_indexer_config["objstorage"])
    fill_obj_storage(objstorage)
    with patch("swh.indexer.indexer.get_objstorage", return_value=objstorage):
        yield objstorage


@pytest.fixture
def swh_config(swh_indexer_config, monkeypatch, tmp_path):
    """Dump the indexer configuration to a YAML file, point
    SWH_CONFIG_FILENAME at it, and return its path."""
    conf_path = os.path.join(str(tmp_path), "indexer.yml")
    with open(conf_path, "w") as fd:
        yaml.dump(swh_indexer_config, fd)
    monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path)
    return conf_path
def test_compute_metadata_pubspec():
    """Translate a representative pubspec.yaml: folded (``>-``) description,
    keyword list, license, and homepage mapping."""
    raw_content = b"""
---
name: newtify
description: >-
  Have you been turned into a newt? Would you like to be?
  This package can help. It has all of the
  newt-transmogrification functionality you have been looking
  for.
keywords:
  - polyfill
  - shim
  - compatibility
  - portable
  - mbstring
version: 1.2.3
license: MIT
homepage: https://example-pet-store.com/newtify
documentation: https://example-pet-store.com/newtify/docs

environment:
  sdk: '>=2.10.0 <3.0.0'

dependencies:
  efts: ^2.0.4
  transmogrify: ^0.4.0

dev_dependencies:
  test: '>=1.15.0 <2.0.0'
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    # keyword ordering is an implementation detail: compare as a set
    assert set(result.pop("keywords")) == {
        "polyfill",
        "shim",
        "compatibility",
        "portable",
        "mbstring",
    }, result
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "newtify",
        "description": "Have you been turned into a newt? Would you like to be? "
        "This package can help. It has all of the "
        "newt-transmogrification functionality you have been looking "
        "for.",
        "url": "https://example-pet-store.com/newtify",
        "license": "https://spdx.org/licenses/MIT",
    }

    assert result == expected


def test_normalize_author_pubspec():
    """A scalar ``author:`` entry becomes a one-element Person list.

    NOTE(review): the ``<atlee@example.org>`` markup had been stripped from
    the fixture (angle brackets lost in a dump), so the expected ``email``
    entry could never be produced; restored it.
    """
    raw_content = b"""
author: Atlee Pine <atlee@example.org>
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Atlee Pine", "email": "atlee@example.org"},
        ],
    }

    assert result == expected


def test_normalize_authors_pubspec():
    """``authors:`` list: entries with and without an email address.

    NOTE(review): restored the ``<vmz@example.org>`` markup that had been
    stripped from the fixture; without it the expected ``email`` entry could
    never match.
    """
    raw_content = b"""
authors:
  - Vicky Merzown <vmz@example.org>
  - Ron Bilius Weasley
"""

    result = MAPPINGS["PubMapping"]().translate(raw_content)

    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Vicky Merzown", "email": "vmz@example.org"},
            {
                "type": "Person",
                "name": "Ron Bilius Weasley",
            },
        ],
    }

    assert result == expected
assert result == expected 136 | 137 | 138 | def test_normalize_empty_authors(): 139 | raw_content = b""" 140 | authors: 141 | """ 142 | 143 | result = MAPPINGS["PubMapping"]().translate(raw_content) 144 | 145 | expected = { 146 | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", 147 | "type": "SoftwareSourceCode", 148 | } 149 | 150 | assert result == expected 151 | 152 | 153 | def test_invalid_yaml(): 154 | raw_content = b""" 155 | name: smartech_push 156 | license: { :type => "Commercial", :file => "LICENSE" } 157 | """ 158 | 159 | result = MAPPINGS["PubMapping"]().translate(raw_content) 160 | 161 | assert result is None 162 | 163 | 164 | def test_invalid_tag(): 165 | raw_content = b""" 166 | name: translatron 167 | description: !BETA VERSION - NOT FOR LIVE OR PROD USAGE! 168 | """ 169 | 170 | result = MAPPINGS["PubMapping"]().translate(raw_content) 171 | 172 | assert result is None 173 | -------------------------------------------------------------------------------- /swh/indexer/tests/metadata_dictionary/test_gitea.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.metadata_dictionary import MAPPINGS 7 | 8 | CONTEXT = [ 9 | "https://doi.org/10.5063/schema/codemeta-2.0", 10 | { 11 | "as": "https://www.w3.org/ns/activitystreams#", 12 | "forge": "https://forgefed.org/ns#", 13 | "xsd": "http://www.w3.org/2001/XMLSchema#", 14 | }, 15 | ] 16 | 17 | 18 | def test_compute_metadata_none(): 19 | """ 20 | testing content empty content is empty 21 | should return None 22 | """ 23 | content = b"" 24 | 25 | # None if no metadata was found or an error occurred 26 | declared_metadata = None 27 | result = MAPPINGS["GiteaMapping"]().translate(content) 28 | 
def test_supported_terms():
    """The Gitea mapping must advertise at least these schema.org /
    ForgeFed / ActivityStreams terms."""
    expected = {
        "http://schema.org/name",
        "http://schema.org/dateCreated",
        "https://forgefed.org/ns#forks",
        "https://www.w3.org/ns/activitystreams#totalItems",
    }
    assert expected <= MAPPINGS["GiteaMapping"].supported_terms()
"2022-06-13T18:54:26+02:00", 93 | "updated_at": "2022-09-02T03:57:22+02:00", 94 | "permissions": { 95 | "admin": false, 96 | "push": false, 97 | "pull": true 98 | }, 99 | "has_issues": true, 100 | "internal_tracker": { 101 | "enable_time_tracker": true, 102 | "allow_only_contributors_to_track_time": true, 103 | "enable_issue_dependencies": true 104 | }, 105 | "has_wiki": false, 106 | "has_pull_requests": true, 107 | "has_projects": true, 108 | "ignore_whitespace_conflicts": false, 109 | "allow_merge_commits": false, 110 | "allow_rebase": false, 111 | "allow_rebase_explicit": false, 112 | "allow_squash_merge": true, 113 | "default_merge_style": "squash", 114 | "avatar_url": "", 115 | "internal": false, 116 | "mirror_interval": "", 117 | "mirror_updated": "0001-01-01T00:00:00Z", 118 | "repo_transfer": null 119 | } 120 | """ 121 | result = MAPPINGS["GiteaMapping"]().translate(content) 122 | assert result == { 123 | "@context": CONTEXT, 124 | "type": "forge:Repository", 125 | "id": "https://codeberg.org/ForgeFed/ForgeFed", 126 | "forge:forks": { 127 | "as:totalItems": {"type": "xsd:nonNegativeInteger", "@value": "6"}, 128 | "type": "as:OrderedCollection", 129 | }, 130 | "as:likes": { 131 | "as:totalItems": { 132 | "type": "xsd:nonNegativeInteger", 133 | "@value": "30", 134 | }, 135 | "type": "as:Collection", 136 | }, 137 | "as:followers": { 138 | "as:totalItems": { 139 | "type": "xsd:nonNegativeInteger", 140 | "@value": "11", 141 | }, 142 | "type": "as:Collection", 143 | }, 144 | "name": "ForgeFed", 145 | "description": "ActivityPub-based forge federation protocol specification", 146 | "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", 147 | "dateCreated": "2022-06-13T18:54:26+02:00", 148 | "dateModified": "2022-09-02T03:57:22+02:00", 149 | "programmingLanguage": "CSS", 150 | "url": "https://forgefed.org", 151 | } 152 | 153 | 154 | def test_gitea_fork(): 155 | content = b""" 156 | { 157 | "name": "fork-name", 158 | "description": "fork description", 159 | 
"html_url": "http://example.org/test-fork", 160 | "parent": { 161 | "name": "parent-name", 162 | "description": "parent description", 163 | "html_url": "http://example.org/test-software" 164 | } 165 | } 166 | """ 167 | result = MAPPINGS["GiteaMapping"]().translate(content) 168 | assert result == { 169 | "@context": CONTEXT, 170 | "type": "forge:Repository", 171 | "id": "http://example.org/test-fork", 172 | "description": "fork description", 173 | "name": "fork-name", 174 | "forge:forkedFrom": { 175 | "id": "http://example.org/test-software", 176 | }, 177 | } 178 | -------------------------------------------------------------------------------- /swh/indexer/tests/metadata_dictionary/test_python.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.metadata_dictionary import MAPPINGS 7 | from swh.indexer.metadata_dictionary.base import DirectoryLsEntry 8 | from swh.model.hashutil import hash_to_bytes 9 | from swh.objstorage.interface import CompositeObjId 10 | 11 | 12 | def test_compute_metadata_pkginfo(): 13 | raw_content = b"""\ 14 | Metadata-Version: 2.1 15 | Name: swh.core 16 | Version: 0.0.49 17 | Summary: Software Heritage core utilities 18 | Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ 19 | Author: Software Heritage developers 20 | Author-email: swh-devel@inria.fr 21 | License: UNKNOWN 22 | Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest 23 | Project-URL: Funding, https://www.softwareheritage.org/donate 24 | Project-URL: Source, https://forge.softwareheritage.org/source/swh-core 25 | Description: swh-core 26 | ======== 27 | \x20 28 | core library for swh's modules: 29 | - config parser 30 | - hash 
def test_compute_metadata_pkginfo_utf8():
    """Non-ASCII bytes in PKG-INFO fields are decoded as UTF-8."""
    # assumes continuation lines use the usual 8-space PKG-INFO indent —
    # TODO confirm against the original fixture
    pkginfo = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
        Hydrology N\xc2\xb083
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "snowpyt",
        "description": "foo\nHydrology N°83",
    }


def test_compute_metadata_pkginfo_keywords():
    """A whitespace-separated Keywords field becomes a list
    (order-insensitive)."""
    pkginfo = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    keywords = translated.pop("keywords")
    assert set(keywords) == {"foo", "bar", "baz"}, translated
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "foo",
    }


def test_compute_metadata_pkginfo_license():
    """An SPDX identifier in the License field is expanded to a spdx.org
    URL."""
    pkginfo = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
"""  # noqa
    translated = MAPPINGS["PythonPkginfoMapping"]().translate(pkginfo)
    assert translated == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "foo",
        "license": "https://spdx.org/licenses/MIT",
    }


def test_detect_metadata_files():
    """PKG-INFO directory entries are detected and reported by their sha1."""
    entry = DirectoryLsEntry(
        type="file",
        name=b"PKG-INFO",
        target=hash_to_bytes("1" * 40),
        sha1=hash_to_bytes("2" * 40),
    )
    detected = MAPPINGS["PythonPkginfoMapping"]().detect_metadata_files([entry])
    assert detected == [CompositeObjId(sha1=entry["sha1"])]
@pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547")
def test_gemspec_two_author_fields():
    """When both ``author`` and ``authors`` are present, the two persons are
    merged (in either order)."""
    gemspec = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
    result = MAPPINGS["GemspecMapping"]().translate(gemspec)
    # the merge order is unspecified: compare after sorting by name
    coders = result.pop("author")
    assert sorted(coders, key=lambda person: person["name"]) == [
        {"type": "Person", "name": "Ruby Coder1"},
        {"type": "Person", "name": "Ruby Coder2"},
    ]
    assert result == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }


def test_gemspec_invalid_author():
    """Author values that are not plain strings are ignored."""
    base = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }
    mapping = MAPPINGS["GemspecMapping"]()

    # a list where a single string is expected
    assert (
        mapping.translate(
            b"""
Gem::Specification.new do |s|
  s.author = ["Ruby Coder"]
end"""
        )
        == base
    )

    # a trailing comma makes the value a tuple-like expression, not a string
    assert (
        mapping.translate(
            b"""
Gem::Specification.new do |s|
  s.author = "Ruby Coder1",
end"""
        )
        == base
    )

    # nested lists: only the well-formed entry is kept
    assert mapping.translate(
        b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
    ) == {
        **base,
        "author": [{"type": "Person", "name": "Ruby Coder1"}],
    }


def test_gemspec_alternative_header():
    """Gemspecs using the brace-block form (and a leading require) are still
    recognized."""
    gemspec = b"""
require './lib/version'

Gem::Specification.new { |s|
  s.name = 'rb-system-with-aliases'
  s.summary = 'execute system commands with aliases'
}
"""
    assert MAPPINGS["GemspecMapping"]().translate(gemspec) == {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "name": "rb-system-with-aliases",
        "description": "execute system commands with aliases",
    }


@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
    strategies.dictionaries(
        # keys: arbitrary text, or one of the field names the mapping knows
        strategies.one_of(
            strategies.text(),
            *map(strategies.just, MAPPINGS["GemspecMapping"].mapping),  # type: ignore
        ),
        # values: arbitrarily nested, non-empty lists of characters
        strategies.recursive(
            strategies.characters(),
            lambda children: strategies.lists(children, min_size=1),
        ),
    )
)
def test_gemspec_adversarial(doc):
    """The mapping must never crash, whatever the gemspec contains."""
    lines = [b"Gem::Specification.new do |s|\n"]
    for field, value in doc.items():
        lines.append("  s.{} = {}\n".format(field, repr(value)).encode())
    lines.append(b"end\n")
    MAPPINGS["GemspecMapping"]().translate(b"".join(lines))
-------------------------------------------------------------------------------- /swh/indexer/tests/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from os import path 7 | 8 | import swh.indexer 9 | 10 | SQL_DIR = path.join(path.dirname(swh.indexer.__file__), "sql") 11 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from os.path import join 7 | 8 | import pytest 9 | 10 | from swh.indexer.storage import get_indexer_storage 11 | from swh.indexer.storage.model import ContentLicenseRow, ContentMimetypeRow 12 | from swh.indexer.tests.conftest import idx_storage_postgresql 13 | from swh.model.hashutil import hash_to_bytes 14 | 15 | from . 
class DataObj(dict):
    """Dict with attribute-style access, used to group test data.

    ``data.key`` reads/writes ``data["key"]``.
    """

    def __getattr__(self, key):
        # __getattr__ must raise AttributeError (not KeyError) for missing
        # keys, so that hasattr(), getattr(obj, k, default) and the
        # copy/pickle protocols (which probe for optional attributes)
        # behave correctly.
        try:
            return self.__getitem__(key)
        except KeyError as exc:
            raise AttributeError(key) from exc

    def __setattr__(self, key, value):
        return self.__setitem__(key, value)


@pytest.fixture
def swh_indexer_storage_with_data(swh_indexer_storage):
    """Indexer storage pre-loaded with tools and mimetype rows; returns the
    storage together with a DataObj describing what was inserted."""
    data = DataObj()
    # register the tools and index the resulting rows (with ids) by name
    tools = {
        tool["tool_name"]: {
            "id": tool["id"],
            "name": tool["tool_name"],
            "version": tool["tool_version"],
            "configuration": tool["tool_configuration"],
        }
        for tool in swh_indexer_storage.indexer_configuration_add(TOOLS)
    }
    data.tools = tools
    data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
    data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
    data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
    data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
    data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
    data.origin_url_1 = "file:///dev/0/zero"  # 44434341
    data.origin_url_2 = "file:///dev/1/one"  # 44434342
    data.origin_url_3 = "file:///dev/2/two"  # 54974445
    data.mimetypes = [
        ContentMimetypeRow(indexer_configuration_id=tools["file"]["id"], **mimetype_obj)
        for mimetype_obj in MIMETYPE_OBJECTS
    ]
    swh_indexer_storage.content_mimetype_add(data.mimetypes)
    # NOTE(review): license rows are built but NOT added to the storage here —
    # presumably individual tests add them as needed; confirm against callers
    data.fossology_licenses = [
        ContentLicenseRow(
            id=fossology_obj["id"],
            indexer_configuration_id=tools["nomos"]["id"],
            license=license,
        )
        for fossology_obj in FOSSOLOGY_LICENSES
        for license in fossology_obj["licenses"]
    ]
    swh_indexer_storage._test_data = data

    return (swh_indexer_storage, data)


swh_indexer_storage_postgresql = idx_storage_postgresql
swh_indexer_storage(swh_indexer_storage_postgresql): 74 | return get_indexer_storage( 75 | "postgresql", 76 | db=swh_indexer_storage_postgresql.info.dsn, 77 | journal_writer={ 78 | "cls": "memory", 79 | }, 80 | ) 81 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/generate_data_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from uuid import uuid1 7 | 8 | from hypothesis.strategies import composite, one_of, sampled_from, sets, tuples, uuids 9 | 10 | from swh.model.hashutil import MultiHash 11 | 12 | MIMETYPES = [ 13 | b"application/json", 14 | b"application/octet-stream", 15 | b"application/xml", 16 | b"text/plain", 17 | ] 18 | 19 | ENCODINGS = [ 20 | b"iso8859-1", 21 | b"iso8859-15", 22 | b"latin1", 23 | b"utf-8", 24 | ] 25 | 26 | 27 | def gen_mimetype(): 28 | """Generate one mimetype strategy.""" 29 | return one_of(sampled_from(MIMETYPES)) 30 | 31 | 32 | def gen_encoding(): 33 | """Generate one encoding strategy.""" 34 | return one_of(sampled_from(ENCODINGS)) 35 | 36 | 37 | def _init_content(uuid): 38 | """Given a uuid, initialize a content""" 39 | return { 40 | "id": MultiHash.from_data(uuid.bytes, {"sha1"}).digest()["sha1"], 41 | "indexer_configuration_id": 1, 42 | } 43 | 44 | 45 | @composite 46 | def gen_content_mimetypes(draw, *, min_size=0, max_size=100): 47 | """Generate valid and consistent content_mimetypes. 
48 | 49 | Context: Test purposes 50 | 51 | Args: 52 | **draw** (callable): Used by hypothesis to generate data 53 | **min_size** (int): Minimal number of elements to generate 54 | (default: 0) 55 | **max_size** (int): Maximal number of elements to generate 56 | (default: 100) 57 | 58 | Returns: 59 | List of content_mimetypes as expected by the 60 | content_mimetype_add api endpoint. 61 | 62 | """ 63 | _ids = draw( 64 | sets( 65 | tuples(uuids(), gen_mimetype(), gen_encoding()), 66 | min_size=min_size, 67 | max_size=max_size, 68 | ) 69 | ) 70 | 71 | content_mimetypes = [] 72 | for uuid, mimetype, encoding in _ids: 73 | content_mimetypes.append( 74 | { 75 | **_init_content(uuid), 76 | "mimetype": mimetype, 77 | "encoding": encoding, 78 | } 79 | ) 80 | return content_mimetypes 81 | 82 | 83 | TOOLS = [ 84 | { 85 | "tool_name": "swh-metadata-translator", 86 | "tool_version": "0.0.1", 87 | "tool_configuration": {"type": "local", "context": "NpmMapping"}, 88 | }, 89 | { 90 | "tool_name": "swh-metadata-detector", 91 | "tool_version": "0.0.1", 92 | "tool_configuration": { 93 | "type": "local", 94 | "context": ["NpmMapping", "CodemetaMapping"], 95 | }, 96 | }, 97 | { 98 | "tool_name": "swh-metadata-detector2", 99 | "tool_version": "0.0.1", 100 | "tool_configuration": { 101 | "type": "local", 102 | "context": ["NpmMapping", "CodemetaMapping"], 103 | }, 104 | }, 105 | { 106 | "tool_name": "file", 107 | "tool_version": "5.22", 108 | "tool_configuration": {"command_line": "file --mime "}, 109 | }, 110 | { 111 | "tool_name": "pygments", 112 | "tool_version": "2.0.1+dfsg-1.1+deb8u1", 113 | "tool_configuration": {"type": "library", "debian-package": "python3-pygments"}, 114 | }, 115 | { 116 | "tool_name": "pygments2", 117 | "tool_version": "2.0.1+dfsg-1.1+deb8u1", 118 | "tool_configuration": { 119 | "type": "library", 120 | "debian-package": "python3-pygments", 121 | "max_content_size": 10240, 122 | }, 123 | }, 124 | { 125 | "tool_name": "nomos", 126 | "tool_version": 
"3.1.0rc2-31-ga2cbb8c", 127 | "tool_configuration": {"command_line": "nomossa "}, 128 | }, 129 | ] 130 | 131 | 132 | MIMETYPE_OBJECTS = [ 133 | { 134 | "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"], 135 | "mimetype": mt, 136 | "encoding": enc, 137 | # 'indexer_configuration_id' will be added after TOOLS get registered 138 | } 139 | for mt in MIMETYPES 140 | for enc in ENCODINGS 141 | ] 142 | 143 | LICENSES = [ 144 | b"3DFX", 145 | b"BSD", 146 | b"GPL", 147 | b"Apache2", 148 | b"MIT", 149 | ] 150 | 151 | FOSSOLOGY_LICENSES = [ 152 | { 153 | "id": MultiHash.from_data(uuid1().bytes, {"sha1"}).digest()["sha1"], 154 | "licenses": [ 155 | LICENSES[i % len(LICENSES)], 156 | ], 157 | # 'indexer_configuration_id' will be added after TOOLS get registered 158 | } 159 | for i in range(10) 160 | ] 161 | 162 | 163 | def gen_license(): 164 | return one_of(sampled_from(LICENSES)) 165 | 166 | 167 | @composite 168 | def gen_content_fossology_licenses(draw, *, min_size=0, max_size=100): 169 | """Generate valid and consistent content_fossology_licenses. 170 | 171 | Context: Test purposes 172 | 173 | Args: 174 | **draw** (callable): Used by hypothesis to generate data 175 | **min_size** (int): Minimal number of elements to generate 176 | (default: 0) 177 | **max_size** (int): Maximal number of elements to generate 178 | (default: 100) 179 | 180 | Returns: 181 | List of content_fossology_licenses as expected by the 182 | content_fossology_license_add api endpoint. 
183 | 184 | """ 185 | _ids = draw( 186 | sets( 187 | tuples( 188 | uuids(), 189 | gen_license(), 190 | ), 191 | min_size=min_size, 192 | max_size=max_size, 193 | ) 194 | ) 195 | 196 | content_licenses = [] 197 | for uuid, license in _ids: 198 | content_licenses.append( 199 | { 200 | **_init_content(uuid), 201 | "licenses": [license], 202 | } 203 | ) 204 | return content_licenses 205 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_api_client.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import psycopg 7 | import pytest 8 | 9 | from swh.core.api import RemoteException, TransientRemoteException 10 | from swh.indexer.storage import get_indexer_storage 11 | from swh.indexer.storage.api.client import RemoteStorage 12 | import swh.indexer.storage.api.server as server 13 | 14 | from .test_storage import * # noqa 15 | 16 | 17 | @pytest.fixture 18 | def app_server(swh_indexer_storage_postgresql): 19 | server.storage = get_indexer_storage( 20 | "postgresql", 21 | db=swh_indexer_storage_postgresql.info.dsn, 22 | journal_writer={ 23 | "cls": "memory", 24 | }, 25 | ) 26 | yield server 27 | 28 | 29 | @pytest.fixture 30 | def app(app_server): 31 | return app_server.app 32 | 33 | 34 | @pytest.fixture 35 | def swh_rpc_client_class(): 36 | # these are needed for the swh_indexer_storage_with_data fixture 37 | assert hasattr(RemoteStorage, "indexer_configuration_add") 38 | assert hasattr(RemoteStorage, "content_mimetype_add") 39 | return RemoteStorage 40 | 41 | 42 | @pytest.fixture 43 | def swh_indexer_storage(swh_rpc_client, app_server): 44 | # This version of the swh_storage fixture uses the swh_rpc_client 
fixture 45 | # to instantiate a RemoteStorage (see swh_rpc_client_class above) that 46 | # proxies, via the swh.core RPC mechanism, the local (in memory) storage 47 | # configured in the app fixture above. 48 | # 49 | # Also note that, for the sake of 50 | # making it easier to write tests, the in-memory journal writer of the 51 | # in-memory backend storage is attached to the RemoteStorage as its 52 | # journal_writer attribute. 53 | storage = swh_rpc_client 54 | 55 | journal_writer = getattr(storage, "journal_writer", None) 56 | storage.journal_writer = app_server.storage.journal_writer 57 | yield storage 58 | storage.journal_writer = journal_writer 59 | 60 | 61 | def test_exception(app_server, swh_indexer_storage, mocker): 62 | """Checks the client re-raises unknown exceptions as a :exc:`RemoteException`""" 63 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 64 | mocker.patch.object( 65 | app_server.storage, 66 | "content_mimetype_get", 67 | side_effect=ValueError("crash"), 68 | ) 69 | with pytest.raises(RemoteException) as e: 70 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 71 | assert not isinstance(e, TransientRemoteException) 72 | 73 | 74 | def test_operationalerror_exception(app_server, swh_indexer_storage, mocker): 75 | """Checks the client re-raises as a :exc:`TransientRemoteException` 76 | rather than the base :exc:`RemoteException`; so the retrying proxy 77 | retries for longer.""" 78 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 79 | mocker.patch.object( 80 | app_server.storage, 81 | "content_mimetype_get", 82 | side_effect=psycopg.errors.AdminShutdown("cluster is shutting down"), 83 | ) 84 | with pytest.raises(RemoteException) as excinfo: 85 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 86 | assert isinstance(excinfo.value, TransientRemoteException) 87 | 88 | 89 | def test_querycancelled_exception(app_server, swh_indexer_storage, mocker): 90 | """Checks the client re-raises as 
a :exc:`TransientRemoteException` 91 | rather than the base :exc:`RemoteException`; so the retrying proxy 92 | retries for longer.""" 93 | assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == [] 94 | mocker.patch.object( 95 | app_server.storage, 96 | "content_mimetype_get", 97 | side_effect=psycopg.errors.QueryCanceled("too big!"), 98 | ) 99 | with pytest.raises(RemoteException) as excinfo: 100 | swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) 101 | assert not isinstance(excinfo.value, TransientRemoteException) 102 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_converters.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2022 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from swh.indexer.storage import converters 7 | 8 | 9 | def test_db_to_mimetype() -> None: 10 | input_mimetype = { 11 | "id": b"some-id", 12 | "tool_id": 10, 13 | "tool_name": "some-toolname", 14 | "tool_version": "some-toolversion", 15 | "tool_configuration": {}, 16 | "encoding": b"ascii", 17 | "mimetype": b"text/plain", 18 | } 19 | 20 | expected_mimetype = { 21 | "id": b"some-id", 22 | "encoding": b"ascii", 23 | "mimetype": b"text/plain", 24 | "tool": { 25 | "id": 10, 26 | "name": "some-toolname", 27 | "version": "some-toolversion", 28 | "configuration": {}, 29 | }, 30 | } 31 | 32 | actual_mimetype = converters.db_to_mimetype(input_mimetype) 33 | 34 | assert actual_mimetype == expected_mimetype 35 | 36 | 37 | def test_db_to_fossology_license() -> None: 38 | input_license = { 39 | "id": b"some-id", 40 | "tool_id": 20, 41 | "tool_name": "nomossa", 42 | "tool_version": "5.22", 43 | "tool_configuration": {}, 44 | "license": "GPL2.0", 45 | } 46 | 47 | 
expected_license = { 48 | "id": b"some-id", 49 | "license": "GPL2.0", 50 | "tool": { 51 | "id": 20, 52 | "name": "nomossa", 53 | "version": "5.22", 54 | "configuration": {}, 55 | }, 56 | } 57 | 58 | actual_license = converters.db_to_fossology_license(input_license) 59 | 60 | assert actual_license == expected_license 61 | 62 | 63 | def test_db_to_metadata() -> None: 64 | input_metadata = { 65 | "id": b"some-id", 66 | "tool_id": 20, 67 | "tool_name": "some-toolname", 68 | "tool_version": "some-toolversion", 69 | "tool_configuration": {}, 70 | "metadata": b"metadata", 71 | } 72 | 73 | expected_metadata = { 74 | "id": b"some-id", 75 | "metadata": b"metadata", 76 | "tool": { 77 | "id": 20, 78 | "name": "some-toolname", 79 | "version": "some-toolversion", 80 | "configuration": {}, 81 | }, 82 | } 83 | 84 | actual_metadata = converters.db_to_metadata(input_metadata) 85 | 86 | assert actual_metadata == expected_metadata 87 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_in_memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015-2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | 8 | from swh.indexer.storage import get_indexer_storage 9 | 10 | from .test_storage import * # noqa 11 | 12 | 13 | @pytest.fixture 14 | def swh_indexer_storage(): 15 | return get_indexer_storage( 16 | "memory", 17 | journal_writer={ 18 | "cls": "memory", 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2020 The Software Heritage developers 2 | # See the AUTHORS 
file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from unittest.mock import patch 7 | 8 | from swh.indexer.storage.metrics import ( 9 | OPERATIONS_METRIC, 10 | OPERATIONS_UNIT_METRIC, 11 | send_metric, 12 | ) 13 | 14 | 15 | def test_send_metric_unknown_unit() -> None: 16 | r = send_metric("content", count=10, method_name="content_add") 17 | assert r is False 18 | r = send_metric("sthg:add:bytes:extra", count=10, method_name="sthg_add") 19 | assert r is False 20 | 21 | 22 | def test_send_metric_no_value() -> None: 23 | r = send_metric("content_mimetype:add", count=0, method_name="content_mimetype_add") 24 | assert r is False 25 | 26 | 27 | @patch("swh.indexer.storage.metrics.statsd.increment") 28 | def test_send_metric_no_unit(mock_statsd) -> None: 29 | r = send_metric( 30 | "content_mimetype:add", count=10, method_name="content_mimetype_add" 31 | ) 32 | 33 | mock_statsd.assert_called_with( 34 | OPERATIONS_METRIC, 35 | 10, 36 | tags={ 37 | "endpoint": "content_mimetype_add", 38 | "object_type": "content_mimetype", 39 | "operation": "add", 40 | }, 41 | ) 42 | 43 | assert r 44 | 45 | 46 | @patch("swh.indexer.storage.metrics.statsd.increment") 47 | def test_send_metric_unit(mock_statsd) -> None: 48 | unit_ = "bytes" 49 | r = send_metric("c:add:%s" % unit_, count=100, method_name="c_add") 50 | 51 | expected_metric = OPERATIONS_UNIT_METRIC.format(unit=unit_) 52 | mock_statsd.assert_called_with( 53 | expected_metric, 54 | 100, 55 | tags={ 56 | "endpoint": "c_add", 57 | "object_type": "c", 58 | "operation": "add", 59 | }, 60 | ) 61 | 62 | assert r 63 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020-2022 The Software Heritage developers 2 | # See the AUTHORS 
file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | 8 | from swh.indexer.storage.model import BaseRow, ContentLicenseRow 9 | 10 | 11 | def test_unique_key__no_tool_dict(): 12 | with pytest.raises(ValueError, match="indexer_configuration_id"): 13 | BaseRow(id=12, indexer_configuration_id=34).unique_key() 14 | with pytest.raises(ValueError, match="indexer_configuration_id"): 15 | ContentLicenseRow( 16 | id=12, indexer_configuration_id=34, license="BSD" 17 | ).unique_key() 18 | 19 | 20 | def test_unique_key(): 21 | assert BaseRow( 22 | id=12, tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}} 23 | ).unique_key() == { 24 | "id": 12, 25 | "tool_name": "foo", 26 | "tool_version": "1.2.3", 27 | "tool_configuration": "{}", 28 | } 29 | 30 | assert ContentLicenseRow( 31 | id=12, 32 | tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}}, 33 | license="BSD", 34 | ).unique_key() == { 35 | "id": 12, 36 | "license": "BSD", 37 | "tool_name": "foo", 38 | "tool_version": "1.2.3", 39 | "tool_configuration": "{}", 40 | } 41 | 42 | assert ContentLicenseRow( 43 | id=12, 44 | tool={ 45 | "id": 34, 46 | "name": "foo", 47 | "version": "1.2.3", 48 | "configuration": {"foo": 1, "bar": 2}, 49 | }, 50 | license="BSD", 51 | ).unique_key() == { 52 | "id": 12, 53 | "license": "BSD", 54 | "tool_name": "foo", 55 | "tool_version": "1.2.3", 56 | "tool_configuration": '{"bar": 2, "foo": 1}', 57 | } 58 | -------------------------------------------------------------------------------- /swh/indexer/tests/storage/test_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 
| # See top-level LICENSE file for more information 5 | 6 | import pytest 7 | import yaml 8 | 9 | from swh.indexer.storage.api.server import load_and_check_config 10 | 11 | 12 | def prepare_config_file(tmpdir, content, name="config.yml") -> str: 13 | """Prepare configuration file in `$tmpdir/name` with content `content`. 14 | 15 | Args: 16 | tmpdir (LocalPath): root directory 17 | content (str/dict): Content of the file either as string or as a dict. 18 | If a dict, converts the dict into a yaml string. 19 | name (str): configuration filename 20 | 21 | Returns 22 | path (str) of the configuration file prepared. 23 | 24 | """ 25 | config_path = tmpdir / name 26 | if isinstance(content, dict): # convert if needed 27 | content = yaml.dump(content) 28 | config_path.write_text(content, encoding="utf-8") 29 | # pytest on python3.5 does not support LocalPath manipulation, so 30 | # convert path to string 31 | return str(config_path) 32 | 33 | 34 | @pytest.mark.parametrize("config_path", [None, ""]) 35 | def test_load_and_check_config_no_configuration(config_path) -> None: 36 | """Irrelevant configuration file path raises""" 37 | with pytest.raises(EnvironmentError, match="Configuration file must be defined"): 38 | load_and_check_config(config_path) 39 | 40 | 41 | def test_load_and_check_inexistent_config_path() -> None: 42 | """Inexistent configuration file raises""" 43 | config_path = "/indexer/inexistent/config.yml" 44 | expected_error = f"Configuration file {config_path} does not exist" 45 | with pytest.raises(FileNotFoundError, match=expected_error): 46 | load_and_check_config(config_path) 47 | 48 | 49 | def test_load_and_check_config_wrong_configuration(tmpdir) -> None: 50 | """Wrong configuration raises""" 51 | config_path = prepare_config_file(tmpdir, "something: useless") 52 | with pytest.raises(KeyError, match="Missing '%indexer_storage' configuration"): 53 | load_and_check_config(config_path) 54 | 55 | 56 | def 
test_load_and_check_config_remote_config_fine(tmpdir) -> None: 57 | """'Remote configuration is fine (when changing the default type)""" 58 | config = {"indexer_storage": {"cls": "remote"}} 59 | config_path = prepare_config_file(tmpdir, config) 60 | cfg = load_and_check_config(config_path) 61 | 62 | assert cfg == config 63 | 64 | 65 | def test_load_and_check_config_local_config_fine(tmpdir) -> None: 66 | """'Complete 'postgresql' configuration is fine""" 67 | config = { 68 | "indexer_storage": { 69 | "cls": "postgresql", 70 | "db": "db", 71 | } 72 | } 73 | config_path = prepare_config_file(tmpdir, config) 74 | cfg = load_and_check_config(config_path) 75 | assert cfg == config 76 | 77 | 78 | def test_load_and_check_config_deprecated(tmpdir) -> None: 79 | """'Complete 'local' configuration is fine""" 80 | config = { 81 | "indexer.storage": { 82 | "cls": "postgresql", 83 | "db": "db", 84 | } 85 | } 86 | config_path = prepare_config_file(tmpdir, config) 87 | with pytest.warns(DeprecationWarning): 88 | cfg = load_and_check_config(config_path) 89 | assert "indexer_storage" in cfg 90 | -------------------------------------------------------------------------------- /swh/indexer/tests/test_fossology_license.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2024 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict 7 | import unittest 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | 12 | from swh.indexer import fossology_license 13 | from swh.indexer.fossology_license import FossologyLicenseIndexer, compute_license 14 | from swh.indexer.storage.model import ContentLicenseRow 15 | from swh.indexer.tests.utils import ( 16 | BASE_TEST_CONFIG, 17 | RAW_CONTENT_OBJIDS, 18 | 
SHA1_TO_LICENSES, 19 | CommonContentIndexerTest, 20 | fill_obj_storage, 21 | fill_storage, 22 | filter_dict, 23 | mock_compute_license, 24 | ) 25 | 26 | 27 | class BasicTest(unittest.TestCase): 28 | @patch("swh.indexer.fossology_license.subprocess") 29 | def test_compute_license(self, mock_subprocess): 30 | """Computing licenses from a raw content should return results""" 31 | for path, intermediary_result, output in [ 32 | (b"some/path", None, []), 33 | (b"some/path/2", [], []), 34 | (b"other/path", " contains license(s) GPL,AGPL", ["GPL", "AGPL"]), 35 | ]: 36 | mock_subprocess.check_output.return_value = intermediary_result 37 | 38 | actual_result = compute_license(path) 39 | 40 | self.assertEqual( 41 | actual_result, 42 | { 43 | "licenses": output, 44 | "path": path, 45 | }, 46 | ) 47 | 48 | 49 | CONFIG: Dict[str, Any] = { 50 | **BASE_TEST_CONFIG, 51 | "workdir": "/tmp", 52 | "tools": { 53 | "name": "nomos", 54 | "version": "3.1.0rc2-31-ga2cbb8c", 55 | "configuration": { 56 | "command_line": "nomossa ", 57 | }, 58 | }, 59 | } 60 | 61 | RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) 62 | 63 | 64 | class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): 65 | """Fossology license indexer test scenarios: 66 | 67 | - Known sha1s in the input list have their data indexed 68 | - Unknown sha1 in the input list are not indexed 69 | 70 | """ 71 | 72 | def get_indexer_results(self, ids): 73 | yield from self.idx_storage.content_fossology_license_get(ids) 74 | 75 | def setUp(self): 76 | super().setUp() 77 | # replace actual license computation with a mock 78 | self.orig_compute_license = fossology_license.compute_license 79 | fossology_license.compute_license = mock_compute_license 80 | 81 | self.indexer = FossologyLicenseIndexer(CONFIG) 82 | self.indexer.catch_exceptions = False 83 | self.idx_storage = self.indexer.idx_storage 84 | fill_storage(self.indexer.storage) 85 | fill_obj_storage(self.indexer.objstorage) 86 | 87 | 
self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS 88 | 89 | tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} 90 | 91 | # then 92 | self.expected_results = [ 93 | *[ 94 | ContentLicenseRow(id=self.id0["sha1"], tool=tool, license=license) 95 | for license in SHA1_TO_LICENSES[self.id0["sha1"]] 96 | ], 97 | *[ 98 | ContentLicenseRow(id=self.id1["sha1"], tool=tool, license=license) 99 | for license in SHA1_TO_LICENSES[self.id1["sha1"]] 100 | ], 101 | *[], # self.id2 102 | ] 103 | 104 | def tearDown(self): 105 | super().tearDown() 106 | fossology_license.compute_license = self.orig_compute_license 107 | 108 | 109 | def test_fossology_w_no_tool(): 110 | with pytest.raises(ValueError): 111 | FossologyLicenseIndexer(config=filter_dict(CONFIG, "tools")) 112 | -------------------------------------------------------------------------------- /swh/indexer/tests/test_mimetype.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017-2023 The Software Heritage developers 2 | # See the AUTHORS file at the top-level directory of this distribution 3 | # License: GNU General Public License version 3, or any later version 4 | # See top-level LICENSE file for more information 5 | 6 | from typing import Any, Dict 7 | import unittest 8 | 9 | import pytest 10 | 11 | from swh.indexer.mimetype import MimetypeIndexer, compute_mimetype_encoding 12 | from swh.indexer.storage.model import ContentMimetypeRow 13 | from swh.indexer.tests.utils import ( 14 | BASE_TEST_CONFIG, 15 | RAW_CONTENT_OBJIDS, 16 | RAW_CONTENTS, 17 | CommonContentIndexerTest, 18 | fill_obj_storage, 19 | fill_storage, 20 | filter_dict, 21 | ) 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "content_id,raw_text,mimetypes,encoding", 26 | RAW_CONTENTS, 27 | ) 28 | def test_compute_mimetype_encoding(content_id, raw_text, mimetypes, encoding): 29 | """Compute mimetype encoding should return results""" 30 | actual_result = compute_mimetype_encoding(raw_text) 
31 | 32 | # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye: 1:5.39-3) 33 | # returns different results. This allows to deal with such a case when executing 34 | # tests on different environments machines (e.g. ci tox, ci debian, dev machine, 35 | # ...) 36 | all_mimetypes = mimetypes if isinstance(mimetypes, tuple) else [mimetypes] 37 | 38 | assert actual_result in [ 39 | {"mimetype": mimetype, "encoding": encoding} for mimetype in all_mimetypes 40 | ] 41 | 42 | 43 | CONFIG: Dict[str, Any] = { 44 | **BASE_TEST_CONFIG, 45 | "tools": { 46 | "name": "file", 47 | "version": "1:5.30-1+deb9u1", 48 | "configuration": {"type": "library", "debian-package": "python3-magic"}, 49 | }, 50 | } 51 | 52 | 53 | class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): 54 | """Mimetype indexer test scenarios: 55 | 56 | - Known sha1s in the input list have their data indexed 57 | - Unknown sha1 in the input list are not indexed 58 | 59 | """ 60 | 61 | def get_indexer_results(self, ids): 62 | yield from self.idx_storage.content_mimetype_get(ids) 63 | 64 | def setUp(self): 65 | self.indexer = MimetypeIndexer(config=CONFIG) 66 | self.indexer.catch_exceptions = False 67 | self.idx_storage = self.indexer.idx_storage 68 | fill_storage(self.indexer.storage) 69 | fill_obj_storage(self.indexer.objstorage) 70 | 71 | self.id0, self.id1, self.id2 = RAW_CONTENT_OBJIDS 72 | 73 | tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} 74 | 75 | results = [] 76 | for raw_content_id, raw_content, mimetypes, encoding in RAW_CONTENTS: 77 | # Older libmagic versions (e.g. buster: 1:5.35-4+deb10u2, bullseye: 78 | # 1:5.39-3) returns different results. This allows to deal with such a case 79 | # when executing tests on different environments machines (e.g. ci tox, ci 80 | # debian, dev machine, ...) 
81 | all_mimetypes = mimetypes if isinstance(mimetypes, tuple) else [mimetypes] 82 | 83 | results.extend( 84 | [ 85 | ContentMimetypeRow( 86 | id=raw_content_id["sha1"], 87 | tool=tool, 88 | mimetype=mimetype, 89 | encoding=encoding, 90 | ) 91 | for mimetype in all_mimetypes 92 | ] 93 | ) 94 | 95 | self.expected_results = results 96 | 97 | 98 | RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) 99 | 100 | 101 | def test_mimetype_w_no_tool(): 102 | with pytest.raises(ValueError): 103 | MimetypeIndexer(config=filter_dict(CONFIG, "tools")) 104 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 4 3 | envlist = 4 | black 5 | flake8 6 | mypy 7 | py3 8 | 9 | [testenv] 10 | usedevelop = true 11 | extras = 12 | testing 13 | deps = 14 | pytest-cov 15 | commands = 16 | pytest --doctest-modules \ 17 | !slow: --hypothesis-profile=fast \ 18 | slow: --hypothesis-profile=slow \ 19 | --cov=swh/indexer \ 20 | --cov-branch \ 21 | swh/indexer \ 22 | {posargs} 23 | 24 | [testenv:black] 25 | skip_install = true 26 | deps = 27 | black==25.1.0 28 | commands = 29 | {envpython} -m black --check swh 30 | 31 | [testenv:flake8] 32 | skip_install = true 33 | deps = 34 | flake8==7.1.1 35 | flake8-bugbear==24.12.12 36 | flake8-pyproject==1.2.3 37 | pycodestyle==2.12.1 38 | 39 | commands = 40 | {envpython} -m flake8 41 | 42 | [testenv:mypy] 43 | extras = 44 | testing 45 | deps = 46 | mypy==1.15.0 47 | commands = 48 | mypy swh 49 | 50 | # build documentation outside swh-environment using the current 51 | # git HEAD of swh-docs, is executed on CI for each diff to prevent 52 | # breaking doc build 53 | [testenv:sphinx] 54 | allowlist_externals = make 55 | extras = 56 | testing 57 | deps = 58 | # fetch and install swh-docs 59 | git+https://gitlab.softwareheritage.org/swh/devel/swh-docs.git\#egg=swh.docs 60 | setenv = 61 | 
SWH_PACKAGE_DOC_TOX_BUILD = 1 62 | # turn warnings into errors 63 | SPHINXOPTS = -W 64 | commands = 65 | make -I {env_dir}/share/swh-docs -C docs 66 | --------------------------------------------------------------------------------