diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3e223cac1cb300df89c5eb5035b6dc4fbd475977..02181e7e0576348658117b476fbfe09340eca0d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,19 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml - - repo: https://gitlab.com/pycqa/flake8 - rev: 4.0.1 + - repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 - additional_dependencies: [flake8-bugbear==22.3.23] + additional_dependencies: [flake8-bugbear==22.9.23] - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.2 hooks: - id: codespell name: Check source code spelling @@ -31,11 +31,11 @@ repos: types: [python] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/python/black - rev: 22.3.0 + rev: 22.10.0 hooks: - id: black diff --git a/README.md b/README.md index f4f248183d29825cd197f8c3368968ba0499f7f4..56e255b0a6f4726f7936ed7b90b34629361b1a77 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,10 @@ swh-indexer Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - - ctags - - language - fossology-license - metadata -- revision: - - metadata +- origin: + - metadata (intrinsic, using the content indexer; and extrinsic) An indexer is in charge of: - looking up objects @@ -32,18 +30,13 @@ Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype -- language (queue swh_indexer_content_language): detect the - programming language - -- ctags (queue swh_indexer_content_ctags): compute tags information - - fossology-license (queue swh_indexer_fossology_license): compute the license -- metadata: translate file into translated_metadata dict +- metadata: translate files from ecosystem-specific formats to JSON-LD + (using the schema.org/CodeMeta vocabulary) -Current revision indexers: +Current origin indexers: -- metadata: detects files containing metadata and retrieves translated_metadata - in content_metadata table in storage or run content indexer to translate - files. +- metadata: translate files from ecosystem-specific formats to JSON-LD + (using the schema.org/CodeMeta and ForgeFed vocabularies) diff --git a/docs/dev-info.rst b/docs/dev-info.rst index 9ef8497b905308b41cb9574090977fc93ce8342a..4720098873a7569890053a41f18abcb8d68ad518 100644 --- a/docs/dev-info.rst +++ b/docs/dev-info.rst @@ -26,15 +26,9 @@ commands: .. code-block:: yaml indexers: - # language: - # batch_size: 10 - # check_presence: false fossology_license: batch_size: 10 check_presence: false - # ctags: - # batch_size: 2 - # check_presence: false - Mimetype indexer at ``~/.config/swh/indexer/mimetype.yml`` @@ -132,8 +126,6 @@ commands: - swh_indexer_orchestrator_content_all - swh_indexer_orchestrator_content_text - swh_indexer_content_mimetype - - swh_indexer_content_language - - swh_indexer_content_ctags - swh_indexer_content_fossology_license - swh_loader_svn_mount_and_load - swh_loader_git_express diff --git a/docs/index.rst b/docs/index.rst index 9cc3d625a9f3e49345239fe53a0d34d1ce8ba779..37623521eb23dbdf2d2404fa051d9ab1e52d094b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,6 +14,7 @@ information from archive source code artifacts. README.md dev-info.rst metadata-workflow.rst + swhpkg.rst mesocore.rst @@ -24,4 +25,12 @@ Reference Documentation :maxdepth: 2 cli - /apidoc/swh.indexer + +..
only:: standalone_package_doc + + Indices and tables + ------------------ + + * :ref:`genindex` + * :ref:`modindex` + * :ref:`search` diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst index 4d99106134c484a89b5ed7bbe88f394e85af613c..96bf24f29c85fde26fdb22a6db2ddeab0d10447b 100644 --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -69,7 +69,11 @@ Translation from ecosystem-specific metadata to CodeMeta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Intrinsic metadata is extracted from files provided with a project's source -code, and translated using `CodeMeta`_'s `crosswalk table`_. +code, and translated using `CodeMeta`_'s `crosswalk table`_; which is vendored +in :file:`swh/indexer/data/codemeta/codemeta.csv`. +Ecosystems not yet included in Codemeta's crosswalk have their own +:file:`swh/indexer/data/*.csv` file, with one row for each CodeMeta property, +even when not supported by the ecosystem. All input formats supported so far are straightforward dictionaries (eg. JSON) or can be accessed as such (eg. XML); and the first part of the translation is diff --git a/docs/swhpkg.rst b/docs/swhpkg.rst new file mode 100644 index 0000000000000000000000000000000000000000..bbec70e5f8c2ac6fcdc1359c473f7fe2cbe3814e --- /dev/null +++ b/docs/swhpkg.rst @@ -0,0 +1,117 @@ +SwhPkg Vocabulary +================================ + +.. note:: This is an early draft and hasn't been implemented yet + + +SwhPkg is a vocabulary that complements ontologies like schema.org and CodeMeta +in describing software projects. While the latter are meant to describe +source code projects, SwhPkg describes relationships between different packages released +by such projects. + +The namespace is ``https://www.softwareheritage.org/schema/2023/packages/``; +and it is meant to be used primarily alongside CodeMeta/schema.org +and ForgeFed/ActivityStreams. + + +The following prefixes are used throughout this document for readability: + +.. code-block:: json + + { + "schema": "http://schema.org/", + "codemeta": "https://codemeta.github.io/terms/", + "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", + "swhpackages": "https://archive.softwareheritage.org/packages/", + } + +For example, here is a document using all three together: + +.. 
code-block:: json + + { + "@context": { + "schema": "http://schema.org/", + "codemeta": "https://codemeta.github.io/terms/", + "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/", + "swhpackages": "https://archive.softwareheritage.org/packages/", + "package": {"@id": "swhpkg:package", "@type": "@id"}, + "release": {"@id": "swhpkg:release", "@type": "@id"}, + "dependencies": {"@id": "swhpkg:dependencies"}, + "dependency": {"@id": "swhpkg:dependency", "@type": "@id"}, + "dependent": {"@id": "swhpkg:dependent", "@type": "@id"}, + "kind": {"@id": "swhpkg:kind"}, + "optional": {"@id": "swhpkg:optional"} + }, + "@type": "schema:SoftwareSourceCode", + "@id": "https://npmjs.com/package/d3@7.8.2", + "package": "swhpackages:js/d3", + "release": "swhpackages:js/d3@7.8.2", + "schema:name": "d3", + "schema:version": "7.8.2", + "schema:description": "Data-Driven Documents", + "dependencies": [ + { + "@type": "swhpkg:dependencies", + "@id": "swhpackages:js/d3@7.8.2#d3-array", + "dependent": "swhpackages:js/d3@7.8.2", + "dependency": "swhpackages:js/d3-array", + "constraint": "^3.0.0", + "kind": "runtime", + "optional": false + }, + { + "@type": "swhpkg:dependencies", + "@id": "swhpackages:js/d3@7.8.2#mocha", + "dependent": "swhpackages:js/d3@7.8.2", + "dependency": "swhpackages:js/mocha", + "constraint": ">10.0.0", + "kind": "development", + "optional": true + } + ] + } + +SwhPkg Terms +------------ + +.. list-table:: + :header-rows: 1 + + * - Property + - Type + - Examples + - Description + * - ``package`` + - ``swhpkg:package`` + - ``swhpackages:js/d3``, ``swhpackages:python/numpy`` + - Package that is released by the SoftwareSourceCode/SoftwareApplication. + * - ``release`` + - ``swhpkg:release`` + - ``swhpackages:js/d3@7.8.2``, ``swhpackages:python/numpy@1.24.2`` + - Specific version of the package that is released by the SoftwareSourceCode/SoftwareApplication. + * - ``dependencies`` + - ``swhpkg:dependencies`` + - d3 depends on d3-array and mocha. + - Dependencies of the project. There can be many of them. + * - ``dependent`` + - ``swhpkg:release`` + - ``swhpackages:js/d3@7.8.2`` + - A reference to the package release that depends on the dependency. + * - ``dependency`` + - ``swhpkg:package`` + - ``swhpackages:js/d3``, ``swhpackages:python/django`` + - A reference to the package that is depended on. + * - ``constraint`` + - Text + - ``^3.0.0``, ``>10.0.0`` + - The constraint on a dependency relation. It can be a version range, a git commit hash, or even a file path. + * - ``kind`` + - Text + - ``runtime``, ``development`` + - The type of dependency relation. Common values are ``runtime`` and ``development``. + * - ``optional`` + - boolean + - ``true``, ``false`` + - Whether the dependency is optional or not.
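Since the ``@context`` in the example above is self-contained, such a document can be processed with any JSON-LD library. The following is a minimal sketch (illustration only; SwhPkg is still a draft and not implemented in swh-indexer) showing how the ``swhpkg`` and ``swhpackages`` prefixes expand to full IRIs, using the ``pyld`` library that swh-indexer already depends on:

.. code-block:: python

    import json

    from pyld import jsonld

    # A trimmed-down version of the d3 example above; the @context is inlined,
    # so expansion needs no network access.
    doc = {
        "@context": {
            "schema": "http://schema.org/",
            "swhpkg": "https://www.softwareheritage.org/schema/2023/packages/",
            "swhpackages": "https://archive.softwareheritage.org/packages/",
            "package": {"@id": "swhpkg:package", "@type": "@id"},
            "release": {"@id": "swhpkg:release", "@type": "@id"},
        },
        "@id": "https://npmjs.com/package/d3@7.8.2",
        "package": "swhpackages:js/d3",
        "release": "swhpackages:js/d3@7.8.2",
        "schema:name": "d3",
    }

    # "package" and "release" expand to full swhpkg property IRIs, and their
    # values resolve against the swhpackages namespace.
    print(json.dumps(jsonld.expand(doc), indent=2))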
+ diff --git a/mypy.ini b/mypy.ini index d63e78953bd4973585a369650d034112c3e17408..28c26fbae6feb0024ef68459574c13312e3657de 100644 --- a/mypy.ini +++ b/mypy.ini @@ -11,6 +11,9 @@ ignore_missing_imports = True [mypy-confluent_kafka.*] ignore_missing_imports = True +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-magic.*] ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt index 52654a75308d2104402e4a5eaab50c1067462440..0f868e0e63b54acb8365ef596d8ea5b1ff852856 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ -swh.core[db,http] >= 2.9 +swh.core[db,http] >= 2.20.0 swh.model >= 0.0.15 swh.objstorage >= 0.2.2 swh.scheduler >= 0.5.2 diff --git a/requirements.txt b/requirements.txt index 4dd61a2c280cd34b5ddefd2ae1204e7af8b9aa87..1cfc8ea75d4c3474fa3886ade2458c59753b7fc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ click # the version 2.1.2 is causing segmentation faults # cf https://forge.softwareheritage.org/T3815 frozendict != 2.1.2 +iso8601 pyld rdflib sentry-sdk diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index f5c8889e6cb4114ab8604118f768e141476f2a1c..939b4b1fc8fc83854925963018ad0eefc0f55347 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -247,6 +247,13 @@ def schedule_origin_metadata_reindex( type=int, help="Maximum number of objects to replay. Default is to run forever.", ) +@click.option( + "--batch-size", + "-b", + default=None, + type=int, + help="Batch size. Default is 200.", +) @click.pass_context def journal_client( ctx, @@ -257,6 +264,7 @@ def journal_client( prefix: str, group_id: str, stop_after_objects: Optional[int], + batch_size: Optional[int], ): """ Listens for new objects from the SWH Journal, and either: @@ -280,16 +288,22 @@ def journal_client( scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url) - brokers = brokers or journal_cfg.get("brokers") - if not brokers: + if brokers: + journal_cfg["brokers"] = brokers + if not journal_cfg.get("brokers"): raise ValueError("The brokers configuration is mandatory.") - prefix = prefix or journal_cfg.get("prefix") - group_id = group_id or journal_cfg.get("group_id") + if prefix: + journal_cfg["prefix"] = prefix + if group_id: + journal_cfg["group_id"] = group_id origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get( "origin_metadata_task_type" ) - stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects") + if stop_after_objects: + journal_cfg["stop_after_objects"] = stop_after_objects + if batch_size: + journal_cfg["batch_size"] = batch_size object_types = set() worker_fns: List[Callable[[ObjectsDict], Dict]] = [] @@ -350,11 +364,8 @@ def journal_client( client = get_journal_client( cls="kafka", - brokers=brokers, - prefix=prefix, - group_id=group_id, object_types=list(object_types), - stop_after_objects=stop_after_objects, + **journal_cfg, ) def worker_fn(objects: ObjectsDict): diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py index f1d00b1461172379ebe31c73f08aa52102bf4599..d7ddb72d72c420afa98bbe981e28603ebc163c96 100644 --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -9,7 +9,7 @@ import itertools import json import os.path import re -from typing import Any, List +from typing import Any, Dict, List, Set, TextIO, Tuple from pyld import jsonld import rdflib @@ -66,7 +66,15 @@ def make_absolute_uri(local_name): return uri -def _read_crosstable(fd): +def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]: + 
""" + Given a file-like object to a `CodeMeta crosswalk table` (either the main + cross-table with all columns, or an auxiliary table with just the CodeMeta + column and one ecosystem-specific table); returns a list of all CodeMeta + terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}`` + + .. _CodeMeta crosswalk table: <https://codemeta.github.io/crosswalk/ + """ reader = csv.reader(fd) try: header = next(reader) @@ -75,7 +83,9 @@ def _read_crosstable(fd): data_sources = set(header) - {"Parent Type", "Property", "Type", "Description"} - codemeta_translation = {data_source: {} for data_source in data_sources} + codemeta_translation: Dict[str, Dict[str, rdflib.URIRef]] = { + data_source: {} for data_source in data_sources + } terms = set() for line in reader: # For each canonical name @@ -101,7 +111,7 @@ def _read_crosstable(fd): with open(CROSSWALK_TABLE_PATH) as fd: - (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, CROSSWALK_TABLE) = read_crosstable(fd) def _document_loader(url, options=None): diff --git a/swh/indexer/data/Gitea.csv b/swh/indexer/data/Gitea.csv new file mode 100644 index 0000000000000000000000000000000000000000..4fe89fe07bdb9c583f3a67d89c040ee53f2b021b --- /dev/null +++ b/swh/indexer/data/Gitea.csv @@ -0,0 +1,68 @@ +Property,Gitea +codeRepository,clone_url +programmingLanguage,languages +runtimePlatform, +targetProduct, +applicationCategory, +applicationSubCategory, +downloadUrl, +fileSize, +installUrl, +memoryRequirements, +operatingSystem, +permissions, +processorRequirements, +releaseNotes, +softwareHelp, +softwareRequirements, +softwareVersion, +storageRequirements, +supportingData, +author,owner +citation, +contributor, +copyrightHolder, +copyrightYear, +dateCreated,created_at +dateModified,updated_at +datePublished, +editor, +encoding, +fileFormat, +funder, +keywords, +license, +producer, +provider, +publisher, +sponsor, +version, +isAccessibleForFree, +isPartOf, +hasPart, +position, +description,description +identifier, +name,name +sameAs, +url,website +relatedLink, +givenName, +familyName, +email, +affiliation, +identifier, +name,name +address, +type, +id, +softwareSuggestions, +maintainer, +contIntegration, +buildInstructions, +developmentStatus, +embargoDate, +funding, +issueTracker, +referencePublication, +readme, diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 65f730c5e6105fc4129726ba17b75864e4f4ad42..fbc0e1f5d3e9b2424a5426407e17e2c82db595f0 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -344,6 +344,9 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): sentry_sdk.capture_exception() summary["status"] = "failed" return summary + else: + # Reset tag after we finished processing the given content + sentry_sdk.set_tag("swh-indexer-content-sha1", "") summary_persist = self.persist_index_computations(results) self.results = results @@ -406,6 +409,9 @@ class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]): self.log.exception("Problem when reading contents metadata.") sentry_sdk.capture_exception() summary["status"] = "failed" + else: + # Reset tag after we finished processing the given content + sentry_sdk.set_tag("swh-indexer-content-sha1", "") return summary @@ -493,6 +499,7 @@ class ContentPartitionIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult continue sentry_sdk.set_tag("swh-indexer-content-sha1", sha1) yield from self.index(sha1, raw_content, **kwargs) + sentry_sdk.set_tag("swh-indexer-content-sha1", "") def 
_index_with_skipping_already_done( self, partition_id: int, nb_partitions: int @@ -642,6 +649,7 @@ class OriginIndexer(BaseIndexer[str, None, TResult], Generic[TResult]): for origin in origins: sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) results.extend(self.index(origin.url, **kwargs)) + sentry_sdk.set_tag("swh-indexer-origin-url", "") return results @@ -710,6 +718,8 @@ class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult self.log.exception("Problem when processing directory") sentry_sdk.capture_exception() summary["status"] = "failed" + else: + sentry_sdk.set_tag("swh-indexer-directory-swhid", "") summary_persist = self.persist_index_computations(results) if summary_persist: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 960b6a5c228bd21513b5ab279170e4f3cff01ede..5a7a25c8e6102541783344bdd5eb6c59930a7060 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -4,6 +4,9 @@ # See top-level LICENSE file for more information from copy import deepcopy +import hashlib +import logging +import time from typing import ( Any, Callable, @@ -18,6 +21,7 @@ from typing import ( ) from urllib.parse import urlparse +import pkg_resources import sentry_sdk from swh.core.config import merge_configs @@ -55,6 +59,8 @@ ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") +logger = logging.getLogger(__name__) + def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], @@ -73,21 +79,20 @@ class ExtrinsicMetadataIndexer( def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: - results = [] + results = {} for item in objects.get("raw_extrinsic_metadata", []): - # Drop attribute 'type' (from older model versions) no longer allowed. - item.pop("type", None) remd = RawExtrinsicMetadata.from_dict(item) - sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid()) - results.extend(self.index(remd.id, data=remd)) + sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) + for result in self.index(remd.id, data=remd): + results[result.id] = result except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary - summary_persist = self.persist_index_computations(results) - self.results = results + self.results = list(results.values()) + summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: @@ -105,11 +110,18 @@ class ExtrinsicMetadataIndexer( raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) - if data.target.object_type != ExtendedObjectType.ORIGIN: + if data.target.object_type == ExtendedObjectType.ORIGIN: + origin_sha1 = data.target.object_id + elif data.origin is not None: + # HACK: As swh-search does (yet?) not support searching on directories + # and traversing back to origins, we index metadata on non-origins with + # an origin context as if they were on the origin itself. 
+ origin_sha1 = hashlib.sha1(data.origin.encode()).digest() + else: # other types are not supported yet return [] - if data.authority.type != MetadataAuthorityType.FORGE: + if data.authority.type == MetadataAuthorityType.REGISTRY: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) @@ -131,12 +143,21 @@ class ExtrinsicMetadataIndexer( return [] # TODO: batch requests to origin_get_by_sha1() - origins = self.storage.origin_get_by_sha1([data.target.object_id]) - try: - (origin,) = origins - if origin is None: - raise ValueError() - except ValueError: + for _ in range(6): + origins = self.storage.origin_get_by_sha1([origin_sha1]) + try: + (origin,) = origins + if origin is not None: + break + except ValueError: + pass + # The origin does not exist. This may be due to some replication lag + # between the loader's DB/journal and the DB we are consuming from. + # Wait a bit and try again + logger.debug("Origin %s not found, sleeping for 10s.", data.target) + time.sleep(10) + else: + # Does not exist, or replication lag > 60s. raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: @@ -239,8 +260,8 @@ class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): DEFAULT_CONFIG: Dict[str, Any] = { "tools": { - "name": "swh-metadata-detector", - "version": "0.0.2", + "name": "swh.indexer.metadata", + "version": pkg_resources.get_distribution("swh.indexer").version, "configuration": {}, }, } @@ -356,23 +377,20 @@ class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """ metadata = [] - tool = { - "name": "swh-metadata-translator", - "version": "0.0.2", - "configuration": {}, - } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content - config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} - config["tools"] = [tool] + config = { + k: self.config[k] + for k in [INDEXER_CFG_KEY, "objstorage", "storage", "tools"] + } all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) - cfg["tools"][0]["configuration"]["context"] = mapping_name + cfg["tools"]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] @@ -523,25 +541,27 @@ class OriginMetadataIndexer( results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories - dir_metadata: List[DirectoryIntrinsicMetadataRow] = [] - orig_metadata: List[OriginIntrinsicMetadataRow] = [] + dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} + orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets - if dir_item not in dir_metadata: - dir_metadata.append(dir_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) + if dir_item.id not in dir_metadata: + dir_metadata[dir_item.id] = dir_item + if orig_item.id not in orig_metadata: + orig_metadata[orig_item.id] = orig_item if dir_metadata: summary_dir = 
self.idx_storage.directory_intrinsic_metadata_add( - dir_metadata + list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: - summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) + summary_ori = self.idx_storage.origin_intrinsic_metadata_add( + list(orig_metadata.values()) + ) summary.update(summary_ori) return summary diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py index 99c2504c2a9cb9333185fa273b27a88f4b859a74..715362418efaf7f0b82b1e47cb507a4ec49816dc 100644 --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -8,7 +8,19 @@ from typing import Dict, Type import click -from . import cff, codemeta, composer, dart, github, maven, npm, nuget, python, ruby +from . import ( + cff, + codemeta, + composer, + dart, + gitea, + github, + maven, + npm, + nuget, + python, + ruby, +) from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { @@ -24,6 +36,7 @@ INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { } EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { + "GiteaMapping": gitea.GiteaMapping, "GitHubMapping": github.GitHubMapping, "JsonSwordCodemetaMapping": codemeta.JsonSwordCodemetaMapping, "SwordCodemetaMapping": codemeta.SwordCodemetaMapping, diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py index 418c2ecb82f8f2779a6ef880ed554ffce3263305..e992bb521468ed8482e22593abcfbf7493a958f3 100644 --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -5,7 +5,7 @@ import json import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar +from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, TypeVar, Union import uuid import xml.parsers.expat @@ -19,6 +19,11 @@ from swh.indexer.codemeta import _document_loader, compact from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 +from .utils import add_url_if_valid + +TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" +"""Prefix used to generate temporary URIs for root nodes being translated.""" + class DirectoryLsEntry(TypedDict): target: Sha1 @@ -126,16 +131,21 @@ class BaseIntrinsicMapping(BaseMapping): class SingleFileIntrinsicMapping(BaseIntrinsicMapping): """Base class for all intrinsic metadata mappings that use a single file as input.""" - @property - def filename(self): - """The .json file to extract metadata from.""" - raise NotImplementedError(f"{self.__class__.__name__}.filename") + filename: Union[bytes, Pattern[bytes]] @classmethod def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].lower() == cls.filename: - return [entry["sha1"]] + filename = cls.filename + # Check if filename is a regex or bytes: + if isinstance(filename, bytes): + for entry in file_entries: + if entry["name"].lower() == filename: + return [entry["sha1"]] + else: + for entry in file_entries: + if filename.match(entry["name"]): + return [entry["sha1"]] + return [] @@ -147,6 +157,10 @@ class DictMapping(BaseMapping): """List of fields that are simple strings, and don't need any normalization.""" + date_fields: List[str] = [] + """List of fields that are strings that should be typed as http://schema.org/Date + """ + uri_fields: List[str] = [] """List of fields 
that are simple URIs, and don't need any normalization.""" @@ -166,7 +180,7 @@ class DictMapping(BaseMapping): simple_terms = { str(term) for (key, term) in cls.mapping.items() - if key in cls.string_fields + cls.uri_fields + if key in cls.string_fields + cls.date_fields + cls.uri_fields or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) } @@ -180,6 +194,21 @@ class DictMapping(BaseMapping): return simple_terms | complex_terms + def get_root_uri(self, content_dict: Dict) -> rdflib.URIRef: + """Returns an URI for the SoftwareSourceCode or Repository being described. + + The default implementation uses a temporary URI that is stripped before + normalization by :meth:`_translate_dict`. + """ + # The main object being described (the SoftwareSourceCode) does not necessarily + # may or may not have an id. + # If it does, it will need to be set by a subclass. + # If it doesn't we temporarily use this URI to identify it. Unfortunately, + # we cannot use a blank node as we need to use it for JSON-LD framing later, + # and blank nodes cannot be used for framing in JSON-LD >= 1.1 + root_id = TMP_ROOT_URI_PREFIX + str(uuid.uuid4()) + return rdflib.URIRef(root_id) + def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: """ Translates content by parsing content from a dict object @@ -195,16 +224,47 @@ class DictMapping(BaseMapping): """ graph = rdflib.Graph() - # The main object being described (the SoftwareSourceCode) does not necessarily - # may or may not have an id. - # Either way, we temporarily use this URI to identify it. Unfortunately, - # we cannot use a blank node as we need to use it for JSON-LD framing later, - # and blank nodes cannot be used for framing in JSON-LD >= 1.1 - root_id = ( - "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" - + str(uuid.uuid4()) + root = self.get_root_uri(content_dict) + + self._translate_to_graph(graph, root, content_dict) + + self.sanitize(graph) + + # Convert from rdflib's internal graph representation to JSON + s = graph.serialize(format="application/ld+json") + + # Load from JSON to a list of Python objects + jsonld_graph = json.loads(s) + + # Use JSON-LD framing to turn the graph into a rooted tree + # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} + translated_metadata = jsonld.frame( + jsonld_graph, + {"@id": str(root)}, + options={ + "documentLoader": _document_loader, + "processingMode": "json-ld-1.1", + }, ) - root = rdflib.URIRef(root_id) + + # Remove the temporary id we added at the beginning + assert isinstance(translated_metadata["@id"], str) + if translated_metadata["@id"].startswith(TMP_ROOT_URI_PREFIX): + del translated_metadata["@id"] + + return self.normalize_translation(translated_metadata) + + def _translate_to_graph( + self, graph: rdflib.Graph, root: rdflib.term.Identifier, content_dict: Dict + ) -> None: + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping to the graph passed as parameter + + Args: + content_dict (dict): content dict to translate + + """ graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) for k, v in content_dict.items(): @@ -231,53 +291,54 @@ class DictMapping(BaseMapping): pass elif isinstance(v, list): for item in reversed(v): - graph.add((root, codemeta_key, item)) + if isinstance(item, rdflib.URIRef): + add_url_if_valid(graph, root, codemeta_key, str(item)) + else: + graph.add((root, codemeta_key, item)) else: - graph.add((root, codemeta_key, v)) + if isinstance(v, rdflib.URIRef): + add_url_if_valid(graph, root, 
codemeta_key, str(v)) + else: + graph.add((root, codemeta_key, v)) elif k in self.string_fields and isinstance(v, str): graph.add((root, codemeta_key, rdflib.Literal(v))) elif k in self.string_fields and isinstance(v, list): for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) + elif k in self.date_fields and isinstance(v, str): + typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_v)) + elif k in self.date_fields and isinstance(v, list): + for item in v: + if isinstance(item, str): + typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_item)) elif k in self.uri_fields and isinstance(v, str): - graph.add((root, codemeta_key, rdflib.URIRef(v))) + add_url_if_valid(graph, root, codemeta_key, v) elif k in self.uri_fields and isinstance(v, list): for item in v: - if isinstance(item, str): - graph.add((root, codemeta_key, rdflib.URIRef(item))) + add_url_if_valid(graph, root, codemeta_key, item) else: continue self.extra_translation(graph, root, content_dict) - # Convert from rdflib's internal graph representation to JSON - s = graph.serialize(format="application/ld+json") + def sanitize(self, graph: rdflib.Graph) -> None: + # Remove triples that make PyLD crash + for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): + graph.remove((subject, predicate, rdflib.URIRef(""))) - # Load from JSON to a list of Python objects - jsonld_graph = json.loads(s) - - # Use JSON-LD framing to turn the graph into a rooted tree - # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} - translated_metadata = jsonld.frame( - jsonld_graph, - {"@id": root_id}, - options={ - "documentLoader": _document_loader, - "processingMode": "json-ld-1.1", - }, - ) - - # Remove the temporary id we added at the beginning - if isinstance(translated_metadata["@id"], list): - translated_metadata["@id"].remove(root_id) - else: - del translated_metadata["@id"] - - return self.normalize_translation(translated_metadata) + # Should not happen, but we'd better check, as this may lead to incorrect data + invalid = False + for triple in graph.triples((rdflib.URIRef(""), None, None)): + invalid = True + logging.error("Empty triple subject URI: %r", triple) + if invalid: + raise ValueError("Empty triple subject(s)") def extra_translation( self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any] - ): + ) -> None: """Called at the end of the translation process, and may add arbitrary triples to ``graph`` based on the input dictionary (passed as ``d``).
""" @@ -332,14 +393,14 @@ class SafeLoader(yaml.SafeLoader): } -class YamlMapping(DictMapping, SingleFileIntrinsicMapping): +class YamlMapping(DictMapping): """Base class for all mappings that use Yaml data as input.""" def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]: raw_content_string: str = raw_content.decode() try: content_dict = yaml.load(raw_content_string, Loader=SafeLoader) - except yaml.scanner.ScannerError: + except (yaml.scanner.ScannerError, yaml.parser.ParserError): return None if isinstance(content_dict, dict): diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py index 12121cc0293b90b328f4c1eadbdaae236c1cb402..0d730e883af02061214ff4ea22e623a99f4b79d7 100644 --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from typing import List +import urllib.parse from rdflib import BNode, Graph, Literal, URIRef import rdflib.term @@ -11,25 +12,30 @@ import rdflib.term from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import RDF, SCHEMA -from .base import YamlMapping +from .base import SingleFileIntrinsicMapping, YamlMapping from .utils import add_map DOI = URIRef("https://doi.org/") SPDX = URIRef("https://spdx.org/licenses/") -class CffMapping(YamlMapping): +class CffMapping(YamlMapping, SingleFileIntrinsicMapping): """Dedicated class for Citation (CITATION.cff) mapping and translation""" name = "cff" filename = b"CITATION.cff" mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"] string_fields = ["keywords", "license", "abstract", "version", "doi"] + date_fields = ["date-released"] uri_fields = ["repository-code"] def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node: node: rdflib.term.Node - if "orcid" in author and isinstance(author["orcid"], str): + if ( + "orcid" in author + and isinstance(author["orcid"], str) + and urllib.parse.urlparse(author["orcid"]).netloc + ): node = URIRef(author["orcid"]) else: node = BNode() @@ -57,7 +63,3 @@ class CffMapping(YamlMapping): def normalize_license(self, s: str) -> URIRef: if isinstance(s, str): return SPDX + s - - def normalize_date_released(self, s: str) -> Literal: - if isinstance(s, str): - return Literal(s, datatype=SCHEMA.Date) diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py index 4da5eb6a9512c60c8c86795f9d3eba1df1fbb16c..1fc613f74a6c0345179f0aee72c0ecd0b7957ac8 100644 --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -5,10 +5,12 @@ import collections import json +import logging import re -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET +import iso8601 import xmltodict from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand @@ -19,6 +21,9 @@ ATOM_URI = "http://www.w3.org/2005/Atom" _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) +_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") + +logger = logging.getLogger(__name__) class CodemetaMapping(SingleFileIntrinsicMapping): @@ -61,8 +66,13 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): def supported_terms(cls) -> List[str]: return [term for term in CODEMETA_TERMS if not term.startswith("@")] - def xml_to_jsonld(self, e: ET.Element) -> Dict[str, 
Any]: - doc: Dict[str, List[Dict[str, Any]]] = collections.defaultdict(list) + # Keys are JSON-LD property names (URIs or terms). + # Values are either a single string (if key is "type") or a list of + # other dicts with the same type recursively. + # To simplify annotations, we omit the single string case here. + doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) + for child in e: m = _TAG_RE.match(child.tag) assert m, f"Tag with no namespace: {child}" @@ -83,7 +93,42 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): # It is a term defined by the context; write is as-is and JSON-LD # expansion will convert it to a full URI based on # "@context": CODEMETA_CONTEXT_URL - doc[localname].append(self.xml_to_jsonld(child)) + jsonld_child = self.xml_to_jsonld(child) + if ( + localname + in ( + "dateCreated", + "dateModified", + "datePublished", + ) + and isinstance(jsonld_child, str) + and _DATE_RE.match(jsonld_child) + ): + # Dates missing a leading zero for their day/month used + # to be allowed by the deposit, so we need to reformat them + # to be valid ISO8601. + jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() + if localname == "id": + # JSON-LD only allows a single id, and they have to be strings. + if localname in doc: + logger.error( + "Duplicate <id>s in SWORD document: %r and %r", + doc[localname], + jsonld_child, + ) + continue + elif not jsonld_child: + logger.error("Empty <id> value in SWORD document") + continue + elif not isinstance(jsonld_child, str): + logger.error( + "Unexpected <id> value in SWORD document: %r", jsonld_child + ) + continue + else: + doc[localname] = jsonld_child # type: ignore[assignment] + else: + doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) @@ -95,7 +140,7 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): text = e.text.strip() if e.text else None if text: # TODO: check doc is empty, and raise mixed-content error otherwise?
- doc_["@value"] = text + return text return doc_ @@ -106,6 +151,8 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): # Transform to JSON-LD document doc = self.xml_to_jsonld(root) + assert isinstance(doc, dict), f"Root object is not a dict: {doc}" + # Add @context to JSON-LD expansion replaces the "codemeta:" prefix # hash (which uses the context URL as namespace URI for historical # reasons) into properties in `http://schema.org/` and diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py index a43fc23ea777320528a9cab030dd060e280ccbee..0c9b08b4e6eaff49e3bccb76999671280200532c 100644 --- a/swh/indexer/metadata_dictionary/composer.py +++ b/swh/indexer/metadata_dictionary/composer.py @@ -8,7 +8,7 @@ from typing import Optional from rdflib import BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import RDF, SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping @@ -20,7 +20,7 @@ SPDX = URIRef("https://spdx.org/licenses/") COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv") with open(COMPOSER_TABLE_PATH) as fd: - (CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, COMPOSER_TABLE) = read_crosstable(fd) class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping): diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py index ec6dfb26186de263d5a0115afe64712dca362121..01f28c7cc866380a79dd0157cbf626cc8f6e65a4 100644 --- a/swh/indexer/metadata_dictionary/dart.py +++ b/swh/indexer/metadata_dictionary/dart.py @@ -8,10 +8,10 @@ import re from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA -from .base import YamlMapping +from .base import SingleFileIntrinsicMapping, YamlMapping from .utils import add_map SPDX = URIRef("https://spdx.org/licenses/") @@ -19,7 +19,7 @@ SPDX = URIRef("https://spdx.org/licenses/") PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv") with open(PUB_TABLE_PATH) as fd: - (CODEMETA_TERMS, PUB_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd) def name_to_person(name): @@ -29,7 +29,7 @@ def name_to_person(name): } -class PubspecMapping(YamlMapping): +class PubspecMapping(YamlMapping, SingleFileIntrinsicMapping): name = "pubspec" filename = b"pubspec.yaml" diff --git a/swh/indexer/metadata_dictionary/gitea.py b/swh/indexer/metadata_dictionary/gitea.py new file mode 100644 index 0000000000000000000000000000000000000000..4f6e648d02831b54ab232af1ed72e11eb791886d --- /dev/null +++ b/swh/indexer/metadata_dictionary/gitea.py @@ -0,0 +1,124 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from typing import Any, Tuple + +from rdflib import RDF, BNode, Graph, Literal, URIRef + +from swh.indexer.codemeta import _DATA_DIR, read_crosstable +from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA + +from .base import BaseExtrinsicMapping, JsonMapping, produce_terms +from .utils import prettyprint_graph # noqa + +SPDX = URIRef("https://spdx.org/licenses/") + + +GITEA_TABLE_PATH = os.path.join(_DATA_DIR, 
"Gitea.csv") + +with open(GITEA_TABLE_PATH) as fd: + (CODEMETA_TERMS, GITEA_TABLE) = read_crosstable(fd) + + +class GiteaMapping(BaseExtrinsicMapping, JsonMapping): + name = "gitea" + mapping = GITEA_TABLE["Gitea"] + uri_fields = [ + "website", + "clone_url", + ] + date_fields = [ + "created_at", + "updated_at", + ] + string_fields = [ + "name", + "full_name", + "languages", + "description", + ] + + @classmethod + def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: + return ("gitea-project-json", "gogs-project-json") + + def extra_translation(self, graph, root, content_dict): + graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) + graph.add((root, RDF.type, FORGEFED.Repository)) + + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"Gitea/Gogs metadata has invalid/missing html_url: {content_dict}" + ) + + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) + def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_forks_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://forgefed.org/ns#forks": { + "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, FORGEFED.forks, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) + + @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems) + def translate_stars_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_stars_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://www.w3.org/ns/activitystreams#likes": { + "@type": "https://www.w3.org/ns/activitystreams#Collection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, ACTIVITYSTREAMS.likes, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) + + @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems) + def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None: + """ + + >>> graph = Graph() + >>> root = URIRef("http://example.org/test-software") + >>> GiteaMapping().translate_watchers_count(graph, root, 42) + >>> prettyprint_graph(graph, root) + { + "@id": ..., + "https://www.w3.org/ns/activitystreams#followers": { + "@type": "https://www.w3.org/ns/activitystreams#Collection", + "https://www.w3.org/ns/activitystreams#totalItems": 42 + } + } + """ + if isinstance(v, int): + collection = BNode() + graph.add((root, ACTIVITYSTREAMS.followers, collection)) + graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) + graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py index fe3b87ee5e292a12b13e5f1dbb9c1fd49f49b2c8..0435c4154736132945a7d0982365d493c816ad56 100644 --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ 
-8,25 +8,32 @@ from typing import Any, Tuple from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE -from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA +from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA from .base import BaseExtrinsicMapping, JsonMapping, produce_terms -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") class GitHubMapping(BaseExtrinsicMapping, JsonMapping): name = "github" - mapping = CROSSWALK_TABLE["GitHub"] - string_fields = [ - "archive_url", + mapping = { + **CROSSWALK_TABLE["GitHub"], + "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk + "clone_url": SCHEMA.codeRepository, + } + uri_fields = [ + "clone_url", + ] + date_fields = [ "created_at", "updated_at", + ] + string_fields = [ "description", "full_name", - "html_url", - "issues_url", + "topics", ] @classmethod @@ -37,6 +44,22 @@ class GitHubMapping(BaseExtrinsicMapping, JsonMapping): graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) + if content_dict.get("has_issues"): + add_url_if_valid( + graph, + root, + CODEMETA.issueTracker, + URIRef(content_dict["html_url"] + "/issues"), + ) + + def get_root_uri(self, content_dict: dict) -> URIRef: + if isinstance(content_dict.get("html_url"), str): + return URIRef(content_dict["html_url"]) + else: + raise ValueError( + f"GitHub metadata has missing/invalid html_url: {content_dict}" + ) + + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py index a374a5e7b4dad7d3688314cf3fc178e7b63a30ef..5575ba9260a88e692c90d45fbb5981974a17ee2c 100644 --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -6,13 +6,13 @@ import os from typing import Any, Dict -from rdflib import Graph, Literal, URIRef +from rdflib import Graph, Literal from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import SingleFileIntrinsicMapping, XmlMapping -from .utils import prettyprint_graph # noqa +from .utils import add_url_if_valid, prettyprint_graph # noqa class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): @@ -75,7 +75,10 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): and isinstance(artifact_id, str) ): repo = os.path.join(url, *group_id.split("."), artifact_id) - graph.add((root, SCHEMA.codeRepository, URIRef(repo))) + if "${" in repo: + # Often used as templating in pom.xml files collected from VCSs + return + add_url_if_valid(graph, root, SCHEMA.codeRepository, repo) def normalize_groupId(self, id_): """https://maven.apache.org/pom.html#Maven_Coordinates @@ -91,6 +94,7 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): >>> import xmltodict >>> import json + >>> from rdflib import URIRef >>> d = xmltodict.parse(''' ... <licenses> ...
<license> @@ -155,5 +159,5 @@ class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): elif not isinstance(licenses, list): return for license in licenses: - if isinstance(license, dict) and isinstance(license.get("url"), str): - graph.add((root, SCHEMA.license, URIRef(license["url"]))) + if isinstance(license, dict): + add_url_if_valid(graph, root, SCHEMA.license, license.get("url")) diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py index 1540ef6ad4996f1a223db44f5c86d0530243f1c6..b838e5aef86363b582cb8d3be4cd7c739d629b0f 100644 --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import re -import urllib.parse from rdflib import RDF, BNode, Graph, Literal, URIRef @@ -12,7 +11,7 @@ from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping -from .utils import add_list, prettyprint_graph # noqa +from .utils import add_list, add_url_if_valid, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") @@ -88,11 +87,13 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): rdflib.term.URIRef('https://example.org/bugs/') """ if isinstance(d, dict) and isinstance(d.get("url"), str): - return URIRef(d["url"]) + url = d["url"] elif isinstance(d, str): - return URIRef(d) + url = d else: - return None + url = "" + + return URIRef(url) _parse_author = re.compile( r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$" @@ -185,12 +186,7 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): graph.add((author, SCHEMA.name, Literal(name))) if email and isinstance(email, str): graph.add((author, SCHEMA.email, Literal(email))) - if url and isinstance(url, str): - # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop - # URLs that are blatantly invalid early, so PyLD does not crash. - parsed_url = urllib.parse.urlparse(url) - if parsed_url.netloc: - graph.add((author, SCHEMA.url, URIRef(url))) + add_url_if_valid(graph, author, SCHEMA.url, url) add_list(graph, root, SCHEMA.author, [author]) @@ -270,6 +266,16 @@ class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): rdflib.term.URIRef('https://spdx.org/licenses/MIT') """ if isinstance(s, str): + if s.startswith("SEE LICENSE IN "): + # Very common pattern, because it is an example in the specification. + # It is followed by the filename; and the indexer architecture currently + # does not allow accessing that from metadata mappings. 
+ # (Plus, an hypothetical license mapping would eventually pick it up) + return + if " " in s: + # Either an SPDX expression, or unusable data + # TODO: handle it + return return SPDX + s def normalize_keywords(self, lst): diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py index 62f7ea97e4bf22470c212b8faeef5d419323a4f6..6d52c4ac7a249e4e64e5938e169e2b597f3d3b69 100644 --- a/swh/indexer/metadata_dictionary/nuget.py +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -5,31 +5,31 @@ import os.path import re -from typing import Any, Dict, List +from typing import Any, Dict from rdflib import RDF, BNode, Graph, Literal, URIRef -from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, read_crosstable from swh.indexer.namespaces import SCHEMA -from swh.indexer.storage.interface import Sha1 -from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping -from .utils import add_list +from .base import SingleFileIntrinsicMapping, XmlMapping +from .utils import add_list, add_url_if_valid NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") with open(NUGET_TABLE_PATH) as fd: - (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, NUGET_TABLE) = read_crosstable(fd) SPDX = URIRef("https://spdx.org/licenses/") -class NuGetMapping(XmlMapping, BaseIntrinsicMapping): +class NuGetMapping(XmlMapping, SingleFileIntrinsicMapping): """ dedicated class for NuGet (.nuspec) mapping and translation """ name = "nuget" + filename = re.compile(rb".*\.nuspec") mapping = NUGET_TABLE["NuGet"] mapping["copyright"] = URIRef("http://schema.org/copyrightNotice") mapping["language"] = URIRef("http://schema.org/inLanguage") @@ -45,20 +45,13 @@ class NuGetMapping(XmlMapping, BaseIntrinsicMapping): ] uri_fields = ["projectUrl", "licenseUrl"] - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].endswith(b".nuspec"): - return [entry["sha1"]] - return [] - def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]: return super()._translate_dict(d.get("package", {}).get("metadata", {})) def translate_repository(self, graph, root, v): if isinstance(v, dict) and isinstance(v["@url"], str): codemeta_key = URIRef(self.mapping["repository.url"]) - graph.add((root, codemeta_key, URIRef(v["@url"]))) + add_url_if_valid(graph, root, codemeta_key, v["@url"]) def normalize_license(self, v): if isinstance(v, dict) and v["@type"] == "expression": diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py index 71a0b104008d337f82455770063764a9e2ffee66..7031a0a2900ee5fbe3600c86df7b581794f468e2 100644 --- a/swh/indexer/metadata_dictionary/ruby.py +++ b/swh/indexer/metadata_dictionary/ruby.py @@ -6,16 +6,13 @@ import ast import itertools import re -from typing import List from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE -from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.namespaces import SCHEMA -from swh.indexer.storage.interface import Sha1 -from .base import BaseIntrinsicMapping, DictMapping +from .base import DictMapping, SingleFileIntrinsicMapping from .utils import add_map SPDX = URIRef("https://spdx.org/licenses/") @@ -30,8 +27,9 @@ def name_to_person(graph: Graph, name): return author -class GemspecMapping(BaseIntrinsicMapping, DictMapping): +class GemspecMapping(DictMapping, SingleFileIntrinsicMapping): 
name = "gemspec" + filename = re.compile(rb".*\.gemspec") mapping = CROSSWALK_TABLE["Ruby Gem"] string_fields = ["name", "version", "description", "summary", "email"] uri_fields = ["homepage"] @@ -39,13 +37,6 @@ class GemspecMapping(BaseIntrinsicMapping, DictMapping): _re_spec_new = re.compile(r".*Gem::Specification.new +(do|\{) +\|.*\|.*") _re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)") - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - for entry in file_entries: - if entry["name"].endswith(b".gemspec"): - return [entry["sha1"]] - return [] - def translate(self, raw_content): try: raw_content = raw_content.decode() diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py index 173b1461e31f4c5a4281aea9b5ca040c76b3a9f9..6aaf4fd587dd0f84df7d5291cee1c82f43e78c1d 100644 --- a/swh/indexer/metadata_dictionary/utils.py +++ b/swh/indexer/metadata_dictionary/utils.py @@ -5,7 +5,8 @@ import json -from typing import Callable, Iterable, Optional, Sequence, TypeVar +from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar +import urllib.parse from pyld import jsonld from rdflib import RDF, Graph, URIRef @@ -70,3 +71,46 @@ def add_map( """Helper for :func:`add_list` that takes a mapper function ``f``.""" nodes = [f(graph, value) for value in values] add_list(graph, subject, predicate, [node for node in nodes if node]) + + +def add_url_if_valid( + graph: Graph, + subject: rdflib.term.Node, + predicate: rdflib.term.Identifier, + url: Any, +) -> None: + """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed. + + This is meant as a workaround for https://github.com/digitalbazaar/pyld/issues/91 + to drop URLs that are blatantly invalid early, so PyLD does not crash. + + >>> from pprint import pprint + >>> graph = Graph() + >>> subject = rdflib.term.URIRef("http://example.org/test-software") + >>> predicate = rdflib.term.URIRef("http://schema.org/license") + >>> add_url_if_valid( + ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" + ... ) + >>> add_url_if_valid( + ... graph, subject, predicate, 42 + ... 
) + >>> pprint(set(graph.triples((subject, predicate, None)))) + {(rdflib.term.URIRef('http://example.org/test-software'), + rdflib.term.URIRef('http://schema.org/license'), + rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} + """ + if not isinstance(url, str): + return + try: + parsed_url = urllib.parse.urlparse(url) + except Exception: + return + if " " in url or not parsed_url.netloc: + return + graph.add((subject, predicate, rdflib.term.URIRef(url))) diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 2d9ff6dafe509eed9d269b2376a2327be1672e13..82ac133069f91f5300da610f9534204cd7a3ee5f 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -4,15 +4,16 @@ # See top-level LICENSE file for more information import re -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union -from swh.model.model import SnapshotBranch, TargetType +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.interface import PartialBranches, StorageInterface -def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: +def get_head_swhid(storage: StorageInterface, origin_url: str) -> Optional[CoreSWHID]: """Returns the SWHID of the head revision or release of an origin""" visit_status = origin_get_latest_visit_status( storage, origin_url, allowed_statuses=["full"], require_snapshot=True @@ -20,14 +21,24 @@ def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: if not visit_status: return None assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) - if snapshot is None: - return None if visit_status.type == "ftp": - return _try_get_ftp_head(dict(snapshot.branches)) + # We need to fetch all branches in order to find the largest one + snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) + if snapshot is None: + return None + return _try_get_ftp_head(storage, snapshot) else: - return _try_get_head_generic(dict(snapshot.branches)) + # Peak into the snapshot, without fetching too many refs. + # If the snapshot is small, this gets all of it in a single request. + # If the snapshot is large, we will query specific branches as we need them. + partial_branches = storage.snapshot_get_branches( + visit_status.snapshot, branches_count=100 + ) + if partial_branches is None: + # Snapshot does not exist + return None + return _try_get_head_generic(storage, partial_branches) _archive_filename_re = re.compile( @@ -78,31 +89,56 @@ def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]: def _try_get_ftp_head( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, snapshot: Snapshot ) -> Optional[CoreSWHID]: - archive_names = list(branches) + archive_names = list(snapshot.branches) max_archive_name = max(archive_names, key=_parse_version) - return _try_resolve_target(branches, max_archive_name) + return _try_resolve_target( + storage, + {"id": snapshot.id, "branches": dict(snapshot.branches), "next_branch": None}, + branch_name=max_archive_name, + ) def _try_get_head_generic( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, partial_branches: PartialBranches ) -> Optional[CoreSWHID]: # Works on 'deposit', 'pypi', and VCSs. 
- return _try_resolve_target(branches, b"HEAD") or _try_resolve_target( - branches, b"master" - ) + return _try_resolve_target( + storage, partial_branches, branch_name=b"HEAD" + ) or _try_resolve_target(storage, partial_branches, branch_name=b"master") + + +def _get_branch( + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes +) -> Optional[SnapshotBranch]: + """Given a ``branch_name``, gets it from ``partial_branches`` if present, + and fetches it from the storage otherwise.""" + if branch_name in partial_branches["branches"]: + return partial_branches["branches"][branch_name] + elif partial_branches["next_branch"] is not None: + # Branch is not in `partial_branches`, and `partial_branches` indeed partial + res = storage.snapshot_get_branches( + partial_branches["id"], branches_from=branch_name, branches_count=1 + ) + assert res is not None, "Snapshot does not exist anymore" + return res["branches"].get(branch_name) + else: + # Branch is not in `partial_branches`, but `partial_branches` is the full + # list of branches, which means it is a dangling reference. + return None def _try_resolve_target( - branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes ) -> Optional[CoreSWHID]: try: - branch = branches[branch_name] + branch = _get_branch(storage, partial_branches, branch_name) if branch is None: return None + while branch.target_type == TargetType.ALIAS: - branch = branches[branch.target] + branch = _get_branch(storage, partial_branches, branch.target) if branch is None: return None diff --git a/swh/indexer/sql/20-enums.sql b/swh/indexer/sql/20-enums.sql index a357eb51c8ac755ea2ef52fea18ba122da664769..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/swh/indexer/sql/20-enums.sql +++ b/swh/indexer/sql/20-enums.sql @@ -1,100 +0,0 @@ -create type languages as enum ( 'abap', 'abnf', 'actionscript', - 'actionscript-3', 'ada', 'adl', 'agda', 'alloy', 'ambienttalk', - 'antlr', 'antlr-with-actionscript-target', 'antlr-with-c#-target', - 'antlr-with-cpp-target', 'antlr-with-java-target', - 'antlr-with-objectivec-target', 'antlr-with-perl-target', - 'antlr-with-python-target', 'antlr-with-ruby-target', 'apacheconf', - 'apl', 'applescript', 'arduino', 'aspectj', 'aspx-cs', 'aspx-vb', - 'asymptote', 'autohotkey', 'autoit', 'awk', 'base-makefile', 'bash', - 'bash-session', 'batchfile', 'bbcode', 'bc', 'befunge', - 'blitzbasic', 'blitzmax', 'bnf', 'boo', 'boogie', 'brainfuck', - 'bro', 'bugs', 'c', 'c#', 'c++', 'c-objdump', 'ca65-assembler', - 'cadl', 'camkes', 'cbm-basic-v2', 'ceylon', 'cfengine3', - 'cfstatement', 'chaiscript', 'chapel', 'cheetah', 'cirru', 'clay', - 'clojure', 'clojurescript', 'cmake', 'cobol', 'cobolfree', - 'coffeescript', 'coldfusion-cfc', 'coldfusion-html', 'common-lisp', - 'component-pascal', 'coq', 'cpp-objdump', 'cpsa', 'crmsh', 'croc', - 'cryptol', 'csound-document', 'csound-orchestra', 'csound-score', - 'css', 'css+django/jinja', 'css+genshi-text', 'css+lasso', - 'css+mako', 'css+mozpreproc', 'css+myghty', 'css+php', 'css+ruby', - 'css+smarty', 'cuda', 'cypher', 'cython', 'd', 'd-objdump', - 'darcs-patch', 'dart', 'debian-control-file', 'debian-sourcelist', - 'delphi', 'dg', 'diff', 'django/jinja', 'docker', 'dtd', 'duel', - 'dylan', 'dylan-session', 'dylanlid', 'earl-grey', 'easytrieve', - 'ebnf', 'ec', 'ecl', 'eiffel', 'elixir', 'elixir-iex-session', - 'elm', 'emacslisp', 'embedded-ragel', 'erb', 'erlang', - 'erlang-erl-session', 'evoque', 
'ezhil', 'factor', 'fancy', - 'fantom', 'felix', 'fish', 'fortran', 'fortranfixed', 'foxpro', - 'fsharp', 'gap', 'gas', 'genshi', 'genshi-text', 'gettext-catalog', - 'gherkin', 'glsl', 'gnuplot', 'go', 'golo', 'gooddata-cl', 'gosu', - 'gosu-template', 'groff', 'groovy', 'haml', 'handlebars', 'haskell', - 'haxe', 'hexdump', 'html', 'html+cheetah', 'html+django/jinja', - 'html+evoque', 'html+genshi', 'html+handlebars', 'html+lasso', - 'html+mako', 'html+myghty', 'html+php', 'html+smarty', 'html+twig', - 'html+velocity', 'http', 'hxml', 'hy', 'hybris', 'idl', 'idris', - 'igor', 'inform-6', 'inform-6-template', 'inform-7', 'ini', 'io', - 'ioke', 'irc-logs', 'isabelle', 'j', 'jade', 'jags', 'jasmin', - 'java', 'java-server-page', 'javascript', 'javascript+cheetah', - 'javascript+django/jinja', 'javascript+genshi-text', - 'javascript+lasso', 'javascript+mako', 'javascript+mozpreproc', - 'javascript+myghty', 'javascript+php', 'javascript+ruby', - 'javascript+smarty', 'jcl', 'json', 'json-ld', 'julia', - 'julia-console', 'kal', 'kconfig', 'koka', 'kotlin', 'lasso', - 'lean', 'lesscss', 'lighttpd-configuration-file', 'limbo', 'liquid', - 'literate-agda', 'literate-cryptol', 'literate-haskell', - 'literate-idris', 'livescript', 'llvm', 'logos', 'logtalk', 'lsl', - 'lua', 'makefile', 'mako', 'maql', 'mask', 'mason', 'mathematica', - 'matlab', 'matlab-session', 'minid', 'modelica', 'modula-2', - 'moinmoin/trac-wiki-markup', 'monkey', 'moocode', 'moonscript', - 'mozhashpreproc', 'mozpercentpreproc', 'mql', 'mscgen', - 'msdos-session', 'mupad', 'mxml', 'myghty', 'mysql', 'nasm', - 'nemerle', 'nesc', 'newlisp', 'newspeak', - 'nginx-configuration-file', 'nimrod', 'nit', 'nix', 'nsis', 'numpy', - 'objdump', 'objdump-nasm', 'objective-c', 'objective-c++', - 'objective-j', 'ocaml', 'octave', 'odin', 'ooc', 'opa', - 'openedge-abl', 'pacmanconf', 'pan', 'parasail', 'pawn', 'perl', - 'perl6', 'php', 'pig', 'pike', 'pkgconfig', 'pl/pgsql', - 'postgresql-console-(psql)', 'postgresql-sql-dialect', 'postscript', - 'povray', 'powershell', 'powershell-session', 'praat', 'prolog', - 'properties', 'protocol-buffer', 'puppet', 'pypy-log', 'python', - 'python-3', 'python-3.0-traceback', 'python-console-session', - 'python-traceback', 'qbasic', 'qml', 'qvto', 'racket', 'ragel', - 'ragel-in-c-host', 'ragel-in-cpp-host', 'ragel-in-d-host', - 'ragel-in-java-host', 'ragel-in-objective-c-host', - 'ragel-in-ruby-host', 'raw-token-data', 'rconsole', 'rd', 'rebol', - 'red', 'redcode', 'reg', 'resourcebundle', 'restructuredtext', - 'rexx', 'rhtml', 'roboconf-graph', 'roboconf-instances', - 'robotframework', 'rpmspec', 'rql', 'rsl', 'ruby', - 'ruby-irb-session', 'rust', 's', 'sass', 'scala', - 'scalate-server-page', 'scaml', 'scheme', 'scilab', 'scss', 'shen', - 'slim', 'smali', 'smalltalk', 'smarty', 'snobol', 'sourcepawn', - 'sparql', 'sql', 'sqlite3con', 'squidconf', 'stan', 'standard-ml', - 'supercollider', 'swift', 'swig', 'systemverilog', 'tads-3', 'tap', - 'tcl', 'tcsh', 'tcsh-session', 'tea', 'termcap', 'terminfo', - 'terraform', 'tex', 'text-only', 'thrift', 'todotxt', - 'trafficscript', 'treetop', 'turtle', 'twig', 'typescript', - 'urbiscript', 'vala', 'vb.net', 'vctreestatus', 'velocity', - 'verilog', 'vgl', 'vhdl', 'viml', 'x10', 'xml', 'xml+cheetah', - 'xml+django/jinja', 'xml+evoque', 'xml+lasso', 'xml+mako', - 'xml+myghty', 'xml+php', 'xml+ruby', 'xml+smarty', 'xml+velocity', - 'xquery', 'xslt', 'xtend', 'xul+mozpreproc', 'yaml', 'yaml+jinja', - 'zephir', 'unknown' -); -comment on type languages is 'Languages 
recognized by language indexer'; - -create type ctags_languages as enum ( 'Ada', 'AnsiblePlaybook', 'Ant', - 'Asm', 'Asp', 'Autoconf', 'Automake', 'Awk', 'Basic', 'BETA', 'C', - 'C#', 'C++', 'Clojure', 'Cobol', 'CoffeeScript [disabled]', 'CSS', - 'ctags', 'D', 'DBusIntrospect', 'Diff', 'DosBatch', 'DTS', 'Eiffel', - 'Erlang', 'Falcon', 'Flex', 'Fortran', 'gdbinit [disabled]', - 'Glade', 'Go', 'HTML', 'Iniconf', 'Java', 'JavaProperties', - 'JavaScript', 'JSON', 'Lisp', 'Lua', 'M4', 'Make', 'man [disabled]', - 'MatLab', 'Maven2', 'Myrddin', 'ObjectiveC', 'OCaml', 'OldC - [disabled]', 'OldC++ [disabled]', 'Pascal', 'Perl', 'Perl6', 'PHP', - 'PlistXML', 'pod', 'Protobuf', 'Python', 'PythonLoggingConfig', 'R', - 'RelaxNG', 'reStructuredText', 'REXX', 'RpmSpec', 'Ruby', 'Rust', - 'Scheme', 'Sh', 'SLang', 'SML', 'SQL', 'SVG', 'SystemdUnit', - 'SystemVerilog', 'Tcl', 'Tex', 'TTCN', 'Vera', 'Verilog', 'VHDL', - 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir' -); -comment on type ctags_languages is 'Languages recognized by ctags indexer'; diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql index 08587c3900547b387d8d1b73faeb6949b57d26e3..318fb695f1d46aa49414a60c598a9427a001d4fa 100644 --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -36,35 +36,6 @@ comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; --- Language metadata -create table content_language ( - id sha1 not null, - lang languages not null, - indexer_configuration_id bigint not null -); - -comment on table content_language is 'Language information on a raw content'; -comment on column content_language.lang is 'Language information'; -comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; - --- ctags information per content -create table content_ctags ( - id sha1 not null, - name text not null, - kind text not null, - line bigint not null, - lang ctags_languages not null, - indexer_configuration_id bigint not null -); - -comment on table content_ctags is 'Ctags information on a raw content'; -comment on column content_ctags.id is 'Content identifier'; -comment on column content_ctags.name is 'Symbol name'; -comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; -comment on column content_ctags.line is 'Symbol line'; -comment on column content_ctags.lang is 'Language information for that content'; -comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; - create table fossology_license( id smallserial, name text not null diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql index d459a4ab2c2bd32d5a7f63a1fc554413bebc90a9..85f292c6d802a9c9397a262a7cdeedf962c1239b 100644 --- a/swh/indexer/sql/50-func.sql +++ b/swh/indexer/sql/50-func.sql @@ -58,6 +58,7 @@ begin insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set mimetype = excluded.mimetype, encoding = excluded.encoding; @@ -69,118 +70,6 @@ $$; comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; --- add tmp_content_language entries to content_language, overwriting 
duplicates. --- --- If filtering duplicates is in order, the call to --- swh_content_language_missing must take place before calling this --- function. --- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_content_language, 2. call this function -create or replace function swh_content_language_add() - returns bigint - language plpgsql -as $$ -declare - res bigint; -begin - insert into content_language (id, lang, indexer_configuration_id) - select id, lang, indexer_configuration_id - from tmp_content_language tcl - on conflict(id, indexer_configuration_id) - do update set lang = excluded.lang; - - get diagnostics res = ROW_COUNT; - return res; -end -$$; - -comment on function swh_content_language_add() IS 'Add new content languages'; - --- create a temporary table for retrieving content_language -create or replace function swh_mktemp_content_language() - returns void - language sql -as $$ - create temporary table if not exists tmp_content_language ( - like content_language including defaults - ) on commit delete rows; -$$; - -comment on function swh_mktemp_content_language() is 'Helper table to add content language'; - - --- create a temporary table for content_ctags tmp_content_ctags, -create or replace function swh_mktemp_content_ctags() - returns void - language sql -as $$ - create temporary table if not exists tmp_content_ctags ( - like content_ctags including defaults - ) on commit delete rows; -$$; - -comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; - - --- add tmp_content_ctags entries to content_ctags, overwriting duplicates --- --- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, --- 2. call this function -create or replace function swh_content_ctags_add() - returns bigint - language plpgsql -as $$ -declare - res bigint; -begin - insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) - select id, name, kind, line, lang, indexer_configuration_id - from tmp_content_ctags tct - on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) - do nothing; - - get diagnostics res = ROW_COUNT; - return res; -end -$$; - -comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; - -create type content_ctags_signature as ( - id sha1, - name text, - kind text, - line bigint, - lang ctags_languages, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Search within ctags content. 
--- -create or replace function swh_content_ctags_search( - expression text, - l integer default 10, - last_sha1 sha1 default '\x0000000000000000000000000000000000000000') - returns setof content_ctags_signature - language sql -as $$ - select c.id, name, kind, line, lang, - i.id as tool_id, tool_name, tool_version, tool_configuration - from content_ctags c - inner join indexer_configuration i on i.id = c.indexer_configuration_id - where hash_sha1(name) = hash_sha1(expression) - and c.id > last_sha1 - order by id - limit l; -$$; - -comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; - - -- create a temporary table for content_fossology_license tmp_content_fossology_license, create or replace function swh_mktemp_content_fossology_license() returns void @@ -218,6 +107,7 @@ begin (select id from fossology_license where name = tcl.license) as license, indexer_configuration_id from tmp_content_fossology_license tcl + order by tcl.id, license, indexer_configuration_id on conflict(id, license_id, indexer_configuration_id) do update set license_id = excluded.license_id; @@ -237,7 +127,7 @@ comment on function swh_content_fossology_license_add() IS 'Add new content lice -- swh_content_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(content_metadata), 1. COPY to -- tmp_content_metadata, 2. call this function create or replace function swh_content_metadata_add() returns bigint @@ -249,6 +139,7 @@ begin insert into content_metadata (id, metadata, indexer_configuration_id) select id, metadata, indexer_configuration_id from tmp_content_metadata tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata; @@ -280,7 +171,7 @@ comment on function swh_mktemp_content_metadata() is 'Helper table to add conten -- swh_directory_intrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(directory_intrinsic_metadata), 1. COPY to -- tmp_directory_intrinsic_metadata, 2. call this function create or replace function swh_directory_intrinsic_metadata_add() returns bigint @@ -292,6 +183,7 @@ begin insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id from tmp_directory_intrinsic_metadata tcm + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -345,7 +237,7 @@ $$; -- swh_origin_intrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- operates in bulk: 0. swh_mktemp(origin_intrinsic_metadata), 1. COPY to -- tmp_origin_intrinsic_metadata, 2. call this function create or replace function swh_origin_intrinsic_metadata_add() returns bigint @@ -360,6 +252,7 @@ begin select id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -418,7 +311,7 @@ $$; -- swh_origin_extrinsic_metadata_missing must take place before calling this -- function. -- --- operates in bulk: 0. swh_mktemp(content_language), 1. 
COPY to +-- operates in bulk: 0. swh_mktemp(origin_extrinsic_metadata), 1. COPY to -- tmp_origin_extrinsic_metadata, 2. call this function create or replace function swh_origin_extrinsic_metadata_add() returns bigint @@ -433,6 +326,7 @@ begin select id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings from tmp_origin_extrinsic_metadata + order by id, indexer_configuration_id on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -475,6 +369,7 @@ as $$ begin insert into indexer_configuration(tool_name, tool_version, tool_configuration) select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + order by tool_name, tool_version, tool_configuration on conflict(tool_name, tool_version, tool_configuration) do nothing; return query diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql index 5b42af79eeeae36a1aca94db1c1637af9d1816ad..20fe3ca93fc21de91cb46e0ec2c69b1d7765e49a 100644 --- a/swh/indexer/sql/60-indexes.sql +++ b/swh/indexer/sql/60-indexes.sql @@ -10,14 +10,6 @@ alter table indexer_configuration add primary key using index indexer_configurat create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); --- content_ctags -create index on content_ctags(id); -create index on content_ctags(hash_sha1(name)); -create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); - -alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; - -- content_metadata create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); alter table content_metadata add primary key using index content_metadata_pkey; @@ -41,13 +33,6 @@ alter table content_mimetype validate constraint content_mimetype_indexer_config create index on content_mimetype(id) where mimetype like 'text/%'; --- content_language -create unique index content_language_pkey on content_language(id, indexer_configuration_id); -alter table content_language add primary key using index content_language_pkey; - -alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table content_language validate constraint content_language_indexer_configuration_id_fkey; - -- content_fossology_license create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); alter table content_fossology_license add primary key using index content_fossology_license_pkey; diff --git a/swh/indexer/sql/upgrades/136.sql b/swh/indexer/sql/upgrades/136.sql new file mode 100644 index 0000000000000000000000000000000000000000..01499ac2574bc03a3747b6f7bec562fd4cf2d69c --- /dev/null +++ b/swh/indexer/sql/upgrades/136.sql @@ -0,0 +1,214 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 135 +-- to_version: 136 +-- description: Insert from temporary tables in consistent order + +insert into dbversion(version, release, description) + values(136, now(), 'Work In Progress'); + + +create or replace function swh_content_mimetype_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select 
id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_language_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_ctags_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) + select id, name, kind, line, lang, indexer_configuration_id + from tmp_content_ctags tct + order by id, hash_sha1(name), kind, line, lang, indexer_configuration_id + on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) + do nothing; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_fossology_license_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + -- insert unknown licenses first + insert into fossology_license (name) + select distinct license from tmp_content_fossology_license tmp + where not exists (select 1 from fossology_license where name=tmp.license) + on conflict(name) do nothing; + + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + order by tcl.id, license, indexer_configuration_id + on conflict(id, license_id, indexer_configuration_id) + do update set license_id = excluded.license_id; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_content_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_directory_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_directory_intrinsic_metadata tcm + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_origin_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, 
indexer_configuration_id, from_directory, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_directory = excluded.from_directory; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_origin_extrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_extrinsic_metadata_compute_tsvector(); + + insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_remd_id, + metadata_tsvector, mappings + from tmp_origin_extrinsic_metadata + order by id, indexer_configuration_id + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_remd_id = excluded.from_remd_id; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + + +create or replace function swh_indexer_configuration_add() + returns setof indexer_configuration + language plpgsql +as $$ +begin + insert into indexer_configuration(tool_name, tool_version, tool_configuration) + select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + order by tool_name, tool_version, tool_configuration + on conflict(tool_name, tool_version, tool_configuration) do nothing; + + return query + select id, tool_name, tool_version, tool_configuration + from tmp_indexer_configuration join indexer_configuration + using(tool_name, tool_version, tool_configuration); + + return; +end +$$; + + diff --git a/swh/indexer/sql/upgrades/137.sql b/swh/indexer/sql/upgrades/137.sql new file mode 100644 index 0000000000000000000000000000000000000000..152ae0ee4966b1c31a4853e8d0ff8e104bab3d09 --- /dev/null +++ b/swh/indexer/sql/upgrades/137.sql @@ -0,0 +1,19 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 136 +-- to_version: 137 +-- description: Drop content_language and content_ctags tables and related functions + +drop function if exists swh_content_language_add; +drop function if exists swh_mktemp_content_language(); +drop function if exists swh_mktemp_content_ctags(); +drop function if exists swh_content_ctags_add(); +drop function if exists swh_content_ctags_search; + +drop type if exists content_ctags_signature; + +drop table if exists content_language; +drop table if exists content_ctags; + +drop type if exists languages; +drop type if exists ctags_languages; + diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 2c7bbc238e64b4f70dd726d6ca7329a4d98b9002..261dc525b5f38ed4724cea7fdcd143e70089ecde 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -9,6 +9,7 @@ import json from typing import Dict, Iterable, List, Optional, Tuple, Union import warnings +import attr import psycopg2 import psycopg2.pool @@ -115,17 +116,19 @@ def check_id_duplicates(data): Args: data (List[dict]): List of dictionaries to be inserted + >>> tool1 = {"name": "foo", "version": "1.2.3", "configuration": {}} + >>> tool2 = {"name": "foo", "version": "1.2.4", "configuration": {}} >>> check_id_duplicates([ - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="GPL"), - ... 
ContentLicenseRow(id=b'foo', indexer_configuration_id=32, license="GPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="GPL"), + ... ContentLicenseRow(id=b'foo', tool=tool2, license="GPL"), ... ]) >>> check_id_duplicates([ - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"), - ... ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="AGPL"), + ... ContentLicenseRow(id=b'foo', tool=tool1, license="AGPL"), ... ]) Traceback (most recent call last): ... - swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42, 'license': 'AGPL'}] + swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'license': 'AGPL', 'tool_configuration': '{}', 'tool_name': 'foo', 'tool_version': '1.2.3'}] """ # noqa counter = Counter(tuple(sorted(item.unique_key().items())) for item in data) @@ -137,7 +140,7 @@ def check_id_duplicates(data): class IndexerStorage: """SWH Indexer Storage Datastore""" - current_version = 135 + current_version = 137 def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None): """ @@ -147,7 +150,7 @@ class IndexerStorage: `swh.journal.writer.get_journal_writer` """ - self.journal_writer = JournalWriter(self._tool_get_from_id, journal_writer) + self.journal_writer = JournalWriter(journal_writer) try: if isinstance(db, psycopg2.extensions.connection): self._pool = None @@ -169,6 +172,32 @@ class IndexerStorage: if db is not self._db: db.put_conn() + def _join_indexer_configuration(self, entries, db, cur): + """Replaces ``entry.indexer_configuration_id`` with a full tool dict + in ``entry.tool``.""" + joined_entries = [] + + # usually, all the additions in a batch are from the same indexer, + # so this cache allows doing a single query for all the entries. 
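# Editor's sketch (not part of the patch): the caller-visible effect of the
# tool join. The sha1, tool id, and tool name/version below are made-up values
# used only for illustration.
import attr
from swh.indexer.storage.model import ContentMimetypeRow

row = ContentMimetypeRow(
    id=b"\x01" * 20,
    mimetype="text/plain",
    encoding="us-ascii",
    indexer_configuration_id=42,
)
# After _join_indexer_configuration(), the row handed to the journal carries
# the full tool dict instead of the database-local id, e.g.:
journalled = attr.evolve(
    row,
    tool={"name": "file", "version": "5.22", "configuration": {}},
    indexer_configuration_id=None,
)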
+ tool_cache = {} + + for entry in entries: + # get the tool used to generate this addition + tool_id = entry.indexer_configuration_id + assert tool_id + if tool_id not in tool_cache: + tool_cache[tool_id] = dict( + self._tool_get_from_id(tool_id, db=db, cur=cur) + ) + del tool_cache[tool_id]["id"] + entry = attr.evolve( + entry, tool=tool_cache[tool_id], indexer_configuration_id=None + ) + + joined_entries.append(entry) + + return joined_entries + @timed @db_transaction() def check_config(self, *, check_write, db=None, cur=None): @@ -293,9 +322,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(mimetypes) - mimetypes.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_mimetype", mimetypes) + mimetypes_with_tools = self._join_indexer_configuration( + mimetypes, db=db, cur=cur + ) + check_id_duplicates(mimetypes_with_tools) + self.journal_writer.write_additions("content_mimetype", mimetypes_with_tools) db.mktemp_content_mimetype(cur) db.copy_to( [m.to_dict() for m in mimetypes], @@ -341,9 +372,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(licenses) - licenses.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_fossology_license", licenses) + licenses_with_tools = self._join_indexer_configuration(licenses, db=db, cur=cur) + check_id_duplicates(licenses_with_tools) + self.journal_writer.write_additions( + "content_fossology_license", licenses_with_tools + ) db.mktemp_content_fossology_license(cur) db.copy_to( [license.to_dict() for license in licenses], @@ -406,9 +439,9 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions("content_metadata", metadata_with_tools) db.mktemp_content_metadata(cur) @@ -460,9 +493,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "directory_intrinsic_metadata", metadata_with_tools + ) db.mktemp_directory_intrinsic_metadata(cur) @@ -504,9 +539,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "origin_intrinsic_metadata", metadata_with_tools + ) db.mktemp_origin_intrinsic_metadata(cur) @@ -646,9 +683,11 @@ class IndexerStorage: db=None, cur=None, ) -> Dict[str, int]: - check_id_duplicates(metadata) - metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("origin_extrinsic_metadata", metadata) + metadata_with_tools = self._join_indexer_configuration(metadata, db=db, cur=cur) + check_id_duplicates(metadata_with_tools) + self.journal_writer.write_additions( + "origin_extrinsic_metadata", metadata_with_tools + ) db.mktemp_origin_extrinsic_metadata(cur) diff --git 
a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 020dd2bc47787fabd95d4ec3c21a7fc29c92df00..4bad74c424c4f316fdfe7f6cff3fb1fc9665adb9 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -42,6 +42,9 @@ def my_error_handler(exception): return error_handler(exception, encode_data) +app.setup_psycopg2_errorhandlers() + + @app.errorhandler(IndexerStorageArgumentException) def argument_error_handler(exception): return error_handler(exception, encode_data, status_code=400) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index fc4c9ef3e73e0df2d4f111097f44cd93ba903b4c..a99083796f10c7bdf8961bf5fc3f4ab6c654e31f 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -23,6 +23,8 @@ from typing import ( Union, ) +import attr + from swh.core.collections import SortedList from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import SHA1_SIZE @@ -83,6 +85,30 @@ class SubStorage(Generic[TValue]): self._journal_writer = journal_writer self._tools_per_id = defaultdict(set) + def _join_indexer_configuration(self, entries): + """Replaces ``entry.indexer_configuration_id`` with a full tool dict + in ``entry.tool``.""" + joined_entries = [] + + for entry in entries: + # get the tool used to generate this addition + tool_id = entry.indexer_configuration_id + assert tool_id + tool = self._tools[tool_id] + entry = attr.evolve( + entry, + tool={ + "name": tool["tool_name"], + "version": tool["tool_version"], + "configuration": tool["tool_configuration"], + }, + indexer_configuration_id=None, + ) + + joined_entries.append(entry) + + return joined_entries + def _key_from_dict(self, d) -> Tuple: """Like the global _key_from_dict, but filters out dict keys that don't belong in the unique key.""" @@ -210,15 +236,16 @@ class SubStorage(Generic[TValue]): """ data = list(data) - check_id_duplicates(data) + data_with_tools = self._join_indexer_configuration(data) + check_id_duplicates(data_with_tools) object_type = self.row_class.object_type # type: ignore - self._journal_writer.write_additions(object_type, data) + self._journal_writer.write_additions(object_type, data_with_tools) count = 0 - for obj in data: + for (obj, obj_with_tool) in zip(data, data_with_tools): item = obj.to_dict() id_ = item.pop("id") tool_id = item["indexer_configuration_id"] - key = _key_from_dict(obj.unique_key()) + key = _key_from_dict(obj_with_tool.unique_key()) self._data[id_][key] = item self._tools_per_id[id_].add(tool_id) count += 1 @@ -233,16 +260,7 @@ class IndexerStorage: def __init__(self, journal_writer=None): self._tools = {} - def tool_getter(id_): - tool = self._tools[id_] - return { - "id": tool["id"], - "name": tool["tool_name"], - "version": tool["tool_version"], - "configuration": tool["tool_configuration"], - } - - self.journal_writer = JournalWriter(tool_getter, journal_writer) + self.journal_writer = JournalWriter(journal_writer) args = (self._tools, self.journal_writer) self._mimetypes = SubStorage(ContentMimetypeRow, *args) self._licenses = SubStorage(ContentLicenseRow, *args) diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py index c05071ab595a7ce4c9d4ff69459a8a5f1a66a19a..ab8fa89f18f79fa1c2ecb94e14deed26591f98ca 100644 --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -8,6 +8,7 @@ used for the interface of the idx-storage in the near future.""" from __future__ import annotations +import json from typing import Any, Dict, List, 
Optional, Tuple, Type, TypeVar import attr @@ -20,7 +21,7 @@ TSelf = TypeVar("TSelf") @attr.s class BaseRow: - UNIQUE_KEY_FIELDS: Tuple = ("id", "indexer_configuration_id") + UNIQUE_KEY_FIELDS: Tuple = ("id",) id = attr.ib(type=Any) indexer_configuration_id = attr.ib(type=Optional[int], default=None, kw_only=True) @@ -55,15 +56,24 @@ class BaseRow: return cls(**d) def unique_key(self) -> Dict: - obj = self + if not self.tool: + raise ValueError( + f"Cannot compute unique_key of {self.__class__.__name__} with no tool " + f"dictionary (indexer_configuration_id was given instead)" + ) - # tool["id"] and obj.indexer_configuration_id are the same value, but - # only one of them is set for any given object - if obj.indexer_configuration_id is None: - assert obj.tool # constructors ensures tool XOR indexer_configuration_id - obj = attr.evolve(obj, indexer_configuration_id=obj.tool["id"], tool=None) + tool_dict = { + "tool_name": self.tool["name"], + "tool_version": self.tool["version"], + "tool_configuration": json.dumps( + self.tool["configuration"], sort_keys=True + ), + } - return {key: getattr(obj, key) for key in self.UNIQUE_KEY_FIELDS} + return { + **{key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS}, + **tool_dict, + } @attr.s @@ -78,7 +88,7 @@ class ContentMimetypeRow(BaseRow): @attr.s class ContentLicenseRow(BaseRow): object_type: Final = "content_fossology_license" - UNIQUE_KEY_FIELDS = ("id", "indexer_configuration_id", "license") + UNIQUE_KEY_FIELDS = ("id", "license") id = attr.ib(type=Sha1Git) license = attr.ib(type=str) diff --git a/swh/indexer/storage/writer.py b/swh/indexer/storage/writer.py index b4fa3658a63255467828479e2c4761a32359cf6c..e0897592f74276366732a9f3973d2760e2804cd4 100644 --- a/swh/indexer/storage/writer.py +++ b/swh/indexer/storage/writer.py @@ -1,11 +1,9 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Callable, Dict, Iterable, Optional - -import attr +from typing import Any, Dict, Iterable, Optional try: from swh.journal.writer import JournalWriterInterface, get_journal_writer @@ -24,15 +22,12 @@ class JournalWriter: journal: Optional[JournalWriterInterface] - def __init__(self, tool_getter: Callable[[int], Dict[str, Any]], journal_writer): + def __init__(self, journal_writer: Dict[str, Any]): """ Args: - tool_getter: a callable that takes a tool_id and return a dict representing - a tool object journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ - self._tool_getter = tool_getter if journal_writer: if get_journal_writer is None: raise EnvironmentError( @@ -50,20 +45,25 @@ class JournalWriter: if not self.journal: return - # usually, all the additions in a batch are from the same indexer, - # so this cache allows doing a single query for all the entries. 
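# Editor's sketch (not part of the patch): the reworked BaseRow.unique_key()
# flattens the tool dict into the key instead of using the internal id.
# The tool name/version below are made-up values.
from swh.indexer.storage.model import ContentLicenseRow

row = ContentLicenseRow(
    id=b"foo",
    license="GPL",
    tool={"name": "nomos", "version": "3.1", "configuration": {}},
)
assert row.unique_key() == {
    "id": b"foo",
    "license": "GPL",
    "tool_name": "nomos",
    "tool_version": "3.1",
    "tool_configuration": "{}",
}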
- tool_cache = {} + translated = [] for entry in entries: assert entry.object_type == obj_type # type: ignore - # get the tool used to generate this addition - tool_id = entry.indexer_configuration_id - assert tool_id - if tool_id not in tool_cache: - tool_cache[tool_id] = self._tool_getter(tool_id) - entry = attr.evolve( - entry, tool=tool_cache[tool_id], indexer_configuration_id=None - ) - # write to kafka - self.journal.write_addition(obj_type, entry) + # ids are internal to the database and should not be sent to postgresql + if entry.indexer_configuration_id is not None: + raise ValueError( + f"{entry} passed to JournalWriter.write_additions has " + f"indexer_configuration_id instead of full tool dict" + ) + assert entry.tool, "Missing both indexer_configuration_id and tool dict" + if "id" in entry.tool: + raise ValueError( + f"{entry} passed to JournalWriter.write_additions " + f"contains a tool id" + ) + + translated.append(entry) + + # write to kafka + self.journal.write_additions(obj_type, translated) diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index d1ec3ba3d643328f111e815daeacfdd9e12f2029..29a8de7507d794a11240c48a1260d23f4ab0f753 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -11,7 +11,6 @@ from unittest.mock import patch import pytest from pytest_postgresql import factories -import sentry_sdk import yaml from swh.core.db.pytest_plugin import initialize_database_for_module @@ -131,40 +130,3 @@ def swh_config(swh_indexer_config, monkeypatch, tmp_path): f.write(yaml.dump(swh_indexer_config)) monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) return conffile - - -@pytest.fixture -def sentry_init(): - # Inspired by - # https://github.com/getsentry/sentry-python/blob/1.5.9/tests/conftest.py#L168-L184 - - initialized = False - - def inner(*a, **kw): - nonlocal initialized - assert not initialized, "already initialized" - initialized = True - hub = sentry_sdk.Hub.current - client = sentry_sdk.Client(*a, **kw) - hub.bind_client(client) - client.transport = TestTransport() - - class TestTransport: - def __init__(self): - self.events = [] - self.envelopes = [] - - def capture_event(self, event): - self.events.append(event) - - def capture_envelope(self, envelope): - self.envelopes.append(envelope) - - with sentry_sdk.Hub(None): - yield inner - - -@pytest.fixture -def sentry_events(monkeypatch, sentry_init): - sentry_init() - return sentry_sdk.Hub.current.client.transport.events diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index 21865ee4bea30d6189909ac6be046b8ecc61c639..6c9d6def061c33197d9d21f80b04e6cb24760dff 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -6,6 +6,7 @@ import json from hypothesis import HealthCheck, given, settings +import pytest from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_detector import detect_metadata @@ -213,6 +214,7 @@ def test_sword_basics(): <codemeta:author> <codemeta:name>Author 2</codemeta:name> </codemeta:author> + <codemeta:dateCreated>2022-10-26</codemeta:dateCreated> <author> <name>Author 3</name> <email>bar@example.org</email> @@ -229,6 +231,7 @@ def test_sword_basics(): {"name": "Author 2"}, {"name": "Author 3", "email": "bar@example.org"}, ], + "dateCreated": "2022-10-26", } @@ -252,6 +255,117 @@ def test_sword_mixed(): } +@pytest.mark.parametrize("id_", ["", " ", "\n"]) +def 
test_sword_invalid_id(id_): + content = f"""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>{id_}</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "My Software", + } + + +@pytest.mark.parametrize( + "id_", + [ + "foo", + "42", + "http://example.org/", + "http://example.org/foo", + "https://example.org/", + "https://example.org/foo", + ], +) +def test_sword_id(id_): + content = f"""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>{id_}</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": id_, + "name": "My Software", + } + + +def test_sword_multiple_ids(): + """JSON-LD only allows a single id, so we ignore all but the first one.""" + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <id>http://example.org/foo</id> + <id>http://example.org/bar</id> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/foo", + "name": "My Software", + } + + +def test_sword_type(): + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <type>http://schema.org/WebSite</type> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "schema:WebSite", + "name": "My Software", + } + + +def test_sword_multiple_type(): + content = """<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>My Software</name> + <type>http://schema.org/WebSite</type> + <type>http://schema.org/SoftwareSourceCode</type> + </atom:entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result in ( + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["schema:WebSite", "SoftwareSourceCode"], + "name": "My Software", + }, + { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": ["SoftwareSourceCode", "schema:WebSite"], + "name": "My Software", + }, + ) + + def test_sword_schemaorg_in_codemeta(): content = """<?xml version="1.0"?> <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" @@ -273,13 +387,16 @@ def test_sword_schemaorg_in_codemeta(): def test_sword_schemaorg_in_codemeta_constrained(): """Resulting property has the compact URI 'schema:url' instead of just the term 'url', because term 'url' is defined by the Codemeta schema - has having type '@id'.""" + has having type '@id'. 
+ Ditto for dates (with type http://schema.org/Date).""" content = """<?xml version="1.0"?> <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" xmlns="https://doi.org/10.5063/schema/codemeta-2.0" xmlns:schema="http://schema.org/"> <name>My Software</name> <schema:url>http://example.org/my-software</schema:url> + <schema:dateCreated>foo</schema:dateCreated> + <schema:dateModified>2022-10-26</schema:dateModified> </atom:entry> """ @@ -288,6 +405,8 @@ def test_sword_schemaorg_in_codemeta_constrained(): "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "My Software", "schema:url": "http://example.org/my-software", + "schema:dateCreated": "foo", + "schema:dateModified": "2022-10-26", } @@ -351,6 +470,54 @@ def test_sword_multiple_names(): } +def test_sword_propertyvalue(): + content = """<?xml version="1.0"?> + <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>Name</name> + <schema:identifier> + <codemeta:type>schema:PropertyValue</codemeta:type> + <schema:propertyID>HAL-ID</schema:propertyID> + <schema:value>hal-03780423</schema:value> + </schema:identifier> + </entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "Name", + "identifier": { + "schema:propertyID": "HAL-ID", + "schema:value": "hal-03780423", + "type": "schema:PropertyValue", + }, + } + + +def test_sword_fix_date(): + content = """<?xml version="1.0"?> + <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>Name</name> + <codemeta:dateModified>2020-12-1</codemeta:dateModified> + <codemeta:dateCreated>2020-12-2</codemeta:dateCreated> + <codemeta:datePublished>2020-12-3</codemeta:datePublished> + </entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "Name", + "dateModified": "2020-12-01", + "dateCreated": "2020-12-02", + "datePublished": "2020-12-03", + } + + def test_json_sword(): content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}""" # noqa result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content) @@ -362,6 +529,6 @@ def test_json_sword(): ], "license": {"name": "GNU General Public License v3.0 or later"}, "name": "The assignment problem", - "schema:url": "http://example.org/", + "url": "http://example.org/", "name": "The assignment problem", } diff --git a/swh/indexer/tests/metadata_dictionary/test_dart.py b/swh/indexer/tests/metadata_dictionary/test_dart.py index 956d0885329d19da726916485ebc5c33a514c6f0..9dad26379bc5a8801ff9b15db4d836282f548269 100644 --- a/swh/indexer/tests/metadata_dictionary/test_dart.py +++ b/swh/indexer/tests/metadata_dictionary/test_dart.py @@ -9,7 +9,7 @@ from swh.indexer.metadata_dictionary import MAPPINGS def test_compute_metadata_pubspec(): - raw_content = """ + raw_content = b""" --- name: newtify description: >- @@ -37,9 +37,7 @@ dependencies: dev_dependencies: test: 
'>=1.15.0 <2.0.0' - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -66,11 +64,9 @@ for.""", def test_normalize_author_pubspec(): - raw_content = """ + raw_content = b""" author: Atlee Pine <atlee@example.org> - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -86,13 +82,11 @@ def test_normalize_author_pubspec(): def test_normalize_authors_pubspec(): - raw_content = """ + raw_content = b""" authors: - Vicky Merzown <vmz@example.org> - Ron Bilius Weasley - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -113,14 +107,12 @@ def test_normalize_authors_pubspec(): @pytest.mark.xfail(reason="https://github.com/w3c/json-ld-api/issues/547") def test_normalize_author_authors_pubspec(): - raw_content = """ + raw_content = b""" authors: - Vicky Merzown <vmz@example.org> - Ron Bilius Weasley author: Hermione Granger - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -144,11 +136,9 @@ def test_normalize_author_authors_pubspec(): def test_normalize_empty_authors(): - raw_content = """ + raw_content = b""" authors: - """.encode( - "utf-8" - ) + """ result = MAPPINGS["PubMapping"]().translate(raw_content) @@ -158,3 +148,14 @@ def test_normalize_empty_authors(): } assert result == expected + + +def test_invalid_yaml(): + raw_content = b""" + name: smartech_push + license: { :type => "Commercial", :file => "LICENSE" } + """ + + result = MAPPINGS["PubMapping"]().translate(raw_content) + + assert result is None diff --git a/swh/indexer/tests/metadata_dictionary/test_gitea.py b/swh/indexer/tests/metadata_dictionary/test_gitea.py new file mode 100644 index 0000000000000000000000000000000000000000..b1dec7c0d227260cbd9f264f9db3059ab8dad9ea --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_gitea.py @@ -0,0 +1,143 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + +CONTEXT = [ + "https://doi.org/10.5063/schema/codemeta-2.0", + { + "as": "https://www.w3.org/ns/activitystreams#", + "forge": "https://forgefed.org/ns#", + }, +] + + +def test_compute_metadata_none(): + """ + testing content empty content is empty + should return None + """ + content = b"" + + # None if no metadata was found or an error occurred + declared_metadata = None + result = MAPPINGS["GiteaMapping"]().translate(content) + assert declared_metadata == result + + +def test_supported_terms(): + terms = MAPPINGS["GiteaMapping"].supported_terms() + assert { + "http://schema.org/name", + "http://schema.org/dateCreated", + "https://forgefed.org/ns#forks", + "https://www.w3.org/ns/activitystreams#totalItems", + } <= terms + + +def test_compute_metadata_gitea(): + content = b""" +{ + "id": 48043, + "owner": { + "id": 48018, + "login": "ForgeFed", + "full_name": "", + "email": "", + "avatar_url": "https://codeberg.org/avatars/c20f7a6733a6156304137566ee35ef33", + "language": "", + "is_admin": false, + "last_login": "0001-01-01T00:00:00Z", + "created": "2022-04-30T20:13:17+02:00", + "restricted": false, + "active": false, + "prohibit_login": false, + "location": "", + "website": "https://forgefed.org/", + "description": "", + "visibility": "public", + "followers_count": 0, + "following_count": 0, + "starred_repos_count": 
0, + "username": "ForgeFed" + }, + "name": "ForgeFed", + "full_name": "ForgeFed/ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "empty": false, + "private": false, + "fork": false, + "template": false, + "parent": null, + "mirror": false, + "size": 3780, + "language": "CSS", + "languages_url": "https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed/languages", + "html_url": "https://codeberg.org/ForgeFed/ForgeFed", + "ssh_url": "git@codeberg.org:ForgeFed/ForgeFed.git", + "clone_url": "https://codeberg.org/ForgeFed/ForgeFed.git", + "original_url": "https://notabug.org/peers/forgefed", + "website": "https://forgefed.org", + "stars_count": 30, + "forks_count": 6, + "watchers_count": 11, + "open_issues_count": 61, + "open_pr_counter": 10, + "release_counter": 0, + "default_branch": "main", + "archived": false, + "created_at": "2022-06-13T18:54:26+02:00", + "updated_at": "2022-09-02T03:57:22+02:00", + "permissions": { + "admin": false, + "push": false, + "pull": true + }, + "has_issues": true, + "internal_tracker": { + "enable_time_tracker": true, + "allow_only_contributors_to_track_time": true, + "enable_issue_dependencies": true + }, + "has_wiki": false, + "has_pull_requests": true, + "has_projects": true, + "ignore_whitespace_conflicts": false, + "allow_merge_commits": false, + "allow_rebase": false, + "allow_rebase_explicit": false, + "allow_squash_merge": true, + "default_merge_style": "squash", + "avatar_url": "", + "internal": false, + "mirror_interval": "", + "mirror_updated": "0001-01-01T00:00:00Z", + "repo_transfer": null +} + """ + result = MAPPINGS["GiteaMapping"]().translate(content) + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://codeberg.org/ForgeFed/ForgeFed", + "forge:forks": { + "as:totalItems": 6, + "type": "as:OrderedCollection", + }, + "as:likes": { + "as:totalItems": 30, + "type": "as:Collection", + }, + "as:followers": { + "as:totalItems": 11, + "type": "as:Collection", + }, + "name": "ForgeFed", + "description": "ActivityPub-based forge federation protocol specification", + "codeRepository": "https://codeberg.org/ForgeFed/ForgeFed.git", + "dateCreated": "2022-06-13T18:54:26+02:00", + "dateModified": "2022-09-02T03:57:22+02:00", + "url": "https://forgefed.org", + } diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py index c0592dccd79555e82a3a5e02741e667f6d1f9fe2..0ab595f47c5dc6c2ddb8771203099c5476b10721 100644 --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -32,15 +32,13 @@ def test_supported_terms(): assert { "http://schema.org/name", "http://schema.org/license", + "http://schema.org/dateCreated", "https://forgefed.org/ns#forks", "https://www.w3.org/ns/activitystreams#totalItems", } <= terms def test_compute_metadata_github(): - """ - testing only computation of metadata with hard_mapping_npm - """ content = b""" { "id": 80521091, @@ -65,6 +63,8 @@ def test_compute_metadata_github(): "created_at": "2017-01-31T13:05:39Z", "updated_at": "2022-06-22T08:02:20Z", "pushed_at": "2022-06-29T09:01:08Z", + "archive_url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer/{archive_format}{/ref}", + "issues_url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer/issues{/number}", "git_url": "git://github.com/SoftwareHeritage/swh-indexer.git", "ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git", "clone_url": 
"https://github.com/SoftwareHeritage/swh-indexer.git", @@ -116,11 +116,12 @@ def test_compute_metadata_github(): "subscribers_count": 6 } - """ + """ # noqa result = MAPPINGS["GitHubMapping"]().translate(content) assert result == { "@context": CONTEXT, "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", "forge:forks": { "as:totalItems": 1, "type": "as:OrderedCollection", @@ -136,7 +137,42 @@ def test_compute_metadata_github(): "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", - "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", - "schema:dateCreated": "2017-01-31T13:05:39Z", - "schema:dateModified": "2022-06-22T08:02:20Z", + "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer.git", + "dateCreated": "2017-01-31T13:05:39Z", + "dateModified": "2022-06-22T08:02:20Z", + } + + +def test_github_topics(): + content = b""" +{ + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", + "topics": [ + "foo", + "bar" + ] +} + """ + result = MAPPINGS["GitHubMapping"]().translate(content) + assert set(result.pop("keywords", [])) == {"foo", "bar"}, result + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", + } + + +def test_github_issues(): + content = b""" +{ + "html_url": "https://github.com/SoftwareHeritage/swh-indexer", + "has_issues": true +} + """ + result = MAPPINGS["GitHubMapping"]().translate(content) + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + "id": "https://github.com/SoftwareHeritage/swh-indexer", + "issueTracker": "https://github.com/SoftwareHeritage/swh-indexer/issues", } diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py index 0267e95214591fe86422012a6e2b00072f45fee5..afde286d8ebcfe5b949fb5cbefc1af5954a3f8ad 100644 --- a/swh/indexer/tests/metadata_dictionary/test_maven.py +++ b/swh/indexer/tests/metadata_dictionary/test_maven.py @@ -353,6 +353,47 @@ def test_compute_metadata_maven_multiple(): } +def test_compute_metadata_maven_invalid_repository(): + raw_content = b""" + <project> + <name>Maven Default Project</name> + <modelVersion>4.0.0</modelVersion> + <groupId>com.mycompany.app</groupId> + <artifactId>my-app</artifactId> + <version>1.2.3</version> + <repositories> + <repository> + <id>tcc-transaction-internal-releases</id> + <name>internal repository for released artifacts</name> + <url>${repo.internal.releases.url}</url> + <snapshots> + <enabled>false</enabled> + </snapshots> + <releases> + <enabled>true</enabled> + </releases> + </repository> + </repositories> + <licenses> + <license> + <name>Apache License, Version 2.0</name> + <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + <comments>A business-friendly OSS license</comments> + </license> + </licenses> + </project>""" + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "schema:identifier": "com.mycompany.app", + "version": "1.2.3", + "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", + } + + @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py 
b/swh/indexer/tests/metadata_dictionary/test_npm.py index b0ead256dc0aa5af21847a858499942a949e1248..08f8ea668c38987af5f33d726153eac17bf51712 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -294,6 +294,131 @@ def test_npm_repository_normalization(): } +def test_npm_author(): + package_json = rb"""{ + "version": "1.0.0", + "author": "Foo Bar (@example)" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "Foo Bar", "type": "Person"}], + "version": "1.0.0", + } + + +def test_npm_invalid_uris(): + package_json = rb"""{ + "version": "1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http://example.org", + "author": { + "name": "foo", + "url": "" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "url": "http://example.org", + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "" + }, + "bugs": "" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http:example.org", + "author": { + "name": "foo", + "url": "http:example.com" + }, + "bugs": { + "url": "http:example.com" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+https://g ithub.com/foo/bar.git" +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+http://\\u001b[D\\u001b[D\\u001b[Ds\\u001b[C\\u001b[C\\u001b[D\\u001b://github.com/dearzoe/array-combination" +}""" # noqa + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + + +def test_npm_invalid_licenses(): + package_json = rb"""{ + "version": "1.0.0", + "license": "SEE LICENSE IN LICENSE.md", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + 
+
+
 @settings(suppress_health_check=[HealthCheck.too_slow])
 @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping)))  # type: ignore
 def test_npm_adversarial(doc):
diff --git a/swh/indexer/tests/storage/test_api_client.py b/swh/indexer/tests/storage/test_api_client.py
index 250b6d870b58f1112a1a11f56b66387fb29f726c..3620d73fab85abf27a407e24222e7d9a055a8565 100644
--- a/swh/indexer/tests/storage/test_api_client.py
+++ b/swh/indexer/tests/storage/test_api_client.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2015-2019 The Software Heritage developers
+# Copyright (C) 2015-2023 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import psycopg2
 import pytest
 
+from swh.core.api import RemoteException, TransientRemoteException
 from swh.indexer.storage import get_indexer_storage
 from swh.indexer.storage.api.client import RemoteStorage
 import swh.indexer.storage.api.server as server
@@ -54,3 +56,46 @@ def swh_indexer_storage(swh_rpc_client, app_server):
     storage.journal_writer = app_server.storage.journal_writer
     yield storage
     storage.journal_writer = journal_writer
+
+
+def test_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises unknown exceptions as a :exc:`RemoteException`"""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=ValueError("crash"),
+    )
+    with pytest.raises(RemoteException) as e:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert not isinstance(e.value, TransientRemoteException)
+
+
+def test_operationalerror_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises as a :exc:`TransientRemoteException`
+    rather than the base :exc:`RemoteException`, so the retrying proxy
+    retries for longer."""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=psycopg2.errors.AdminShutdown("cluster is shutting down"),
+    )
+    with pytest.raises(RemoteException) as excinfo:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert isinstance(excinfo.value, TransientRemoteException)
+
+
+def test_querycancelled_exception(app_server, swh_indexer_storage, mocker):
+    """Checks the client re-raises as the base :exc:`RemoteException`
+    rather than a :exc:`TransientRemoteException`, so the retrying proxy
+    does not retry for longer."""
+    assert swh_indexer_storage.content_mimetype_get([b"\x01" * 20]) == []
+    mocker.patch.object(
+        app_server.storage,
+        "content_mimetype_get",
+        side_effect=psycopg2.errors.QueryCanceled("too big!"),
+    )
+    with pytest.raises(RemoteException) as excinfo:
+        swh_indexer_storage.content_mimetype_get([b"\x01" * 20])
+    assert not isinstance(excinfo.value, TransientRemoteException)
diff --git a/swh/indexer/tests/storage/test_model.py b/swh/indexer/tests/storage/test_model.py
index d33e5294b98770076fcde4c976241e8fbc3d9e79..981546d750ebf5998fed6968246b6bf17e1e192b 100644
--- a/swh/indexer/tests/storage/test_model.py
+++ b/swh/indexer/tests/storage/test_model.py
@@ -1,26 +1,57 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information +import pytest + from swh.indexer.storage.model import BaseRow, ContentLicenseRow +def test_unique_key__no_tool_dict(): + with pytest.raises(ValueError, match="indexer_configuration_id"): + BaseRow(id=12, indexer_configuration_id=34).unique_key() + with pytest.raises(ValueError, match="indexer_configuration_id"): + ContentLicenseRow( + id=12, indexer_configuration_id=34, license="BSD" + ).unique_key() + + def test_unique_key(): - assert BaseRow(id=12, indexer_configuration_id=34).unique_key() == { + assert BaseRow( + id=12, tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}} + ).unique_key() == { "id": 12, - "indexer_configuration_id": 34, + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": "{}", } - assert BaseRow(id=12, tool={"id": 34, "name": "foo"}).unique_key() == { + assert ContentLicenseRow( + id=12, + tool={"id": 34, "name": "foo", "version": "1.2.3", "configuration": {}}, + license="BSD", + ).unique_key() == { "id": 12, - "indexer_configuration_id": 34, + "license": "BSD", + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": "{}", } assert ContentLicenseRow( - id=12, indexer_configuration_id=34, license="BSD" - ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"} - - assert ContentLicenseRow( - id=12, tool={"id": 34, "name": "foo"}, license="BSD" - ).unique_key() == {"id": 12, "indexer_configuration_id": 34, "license": "BSD"} + id=12, + tool={ + "id": 34, + "name": "foo", + "version": "1.2.3", + "configuration": {"foo": 1, "bar": 2}, + }, + license="BSD", + ).unique_key() == { + "id": 12, + "license": "BSD", + "tool_name": "foo", + "tool_version": "1.2.3", + "tool_configuration": '{"bar": 2, "foo": 1}', + } diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index a2b671453af7f5a5c08950bcece951745807b4a6..e7d20972e83d4ba62a1f6880f0aa8d5b6c1dd558 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -24,6 +24,15 @@ from swh.indexer.storage.model import ( from swh.model.hashutil import hash_to_bytes +def _remove_tool_ids(rows): + results = [] + for row in rows: + tool = dict(row.tool) + del tool["id"] + results.append(attr.evolve(row, tool=tool)) + return results + + def prepare_mimetypes_from_licenses( fossology_licenses: List[ContentLicenseRow], ) -> List[ContentMimetypeRow]: @@ -358,11 +367,13 @@ class StorageETypeTester: assert actual_data == expected_data + expected_journal_data = _remove_tool_ids(expected_data) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_data = [ obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type ] - assert list(sorted(actual_journal_data)) == list(sorted(expected_data)) + assert list(sorted(actual_journal_data)) == list(sorted(expected_journal_data)) class TestIndexerStorageContentMimetypes(StorageETypeTester): @@ -574,11 +585,13 @@ class TestIndexerStorageContentMetadata(StorageETypeTester): assert actual_data in (expected_data_postgresql, expected_data_verbatim) + expected_journal_data = _remove_tool_ids(expected_data_verbatim) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_data = [ obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type ] - assert list(sorted(actual_journal_data)) == list(sorted(expected_data_verbatim)) + assert list(sorted(actual_journal_data)) == 
list(sorted(expected_journal_data)) class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester): @@ -912,13 +925,17 @@ class TestIndexerStorageOriginIntrinsicMetadata: assert actual_metadata == expected_metadata + expected_journal_metadata = _remove_tool_ids(expected_metadata) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_metadata = [ obj for (obj_type, obj) in journal_objects if obj_type == "origin_intrinsic_metadata" ] - assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata)) + assert list(sorted(actual_journal_metadata)) == list( + sorted(expected_journal_metadata) + ) def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] @@ -1527,13 +1544,17 @@ class TestIndexerStorageOriginExtrinsicMetadata: assert actual_metadata == expected_metadata + expected_journal_metadata = _remove_tool_ids(expected_metadata) + journal_objects = storage.journal_writer.journal.objects # type: ignore actual_journal_metadata = [ obj for (obj_type, obj) in journal_objects if obj_type == "origin_extrinsic_metadata" ] - assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata)) + assert list(sorted(actual_journal_metadata)) == list( + sorted(expected_journal_metadata) + ) def test_origin_extrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index 6bbab408be74afc82b22c8f2c498ad4fd2c4418d..439a683b15378cad72ae4c1ff93c3c60f875229d 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -28,7 +28,7 @@ from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes from swh.model.model import Content, Origin, OriginVisitStatus -from .test_metadata import REMD +from .test_metadata import GITHUB_REMD from .utils import ( DIRECTORY2, RAW_CONTENT_IDS, @@ -110,6 +110,7 @@ def test_cli_mapping_list(cli_runner, swh_config): "codemeta", "composer", "gemspec", + "gitea", "github", "json-sword-codemeta", "maven", @@ -710,7 +711,7 @@ def test_cli_journal_client_index__origin_extrinsic_metadata( origin = Origin("http://example.org/repo.git") storage.origin_add([origin]) - raw_extrinsic_metadata = attr.evolve(REMD, target=origin.swhid()) + raw_extrinsic_metadata = attr.evolve(GITHUB_REMD, target=origin.swhid()) raw_extrinsic_metadata = attr.evolve( raw_extrinsic_metadata, id=raw_extrinsic_metadata.compute_hash() ) @@ -749,6 +750,7 @@ def test_cli_journal_client_index__origin_extrinsic_metadata( mappings=["github"], metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 20c49c092be38a481c82d8d8cea16385c11a9cb9..61c71cdc39423ffeeb23745d078334add8a897fb 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -59,7 +59,38 @@ DIRECTORY_METADATA_CONFIG = { "tools": TRANSLATOR_TOOL, } -REMD = RawExtrinsicMetadata( +DEPOSIT_REMD = RawExtrinsicMetadata( + target=ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, + object_id=b"\x02" * 20, + ), + discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), + authority=MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, + url="https://example.org/", + ), 
+ fetcher=MetadataFetcher( + name="example-fetcher", + version="1.0.0", + ), + format="sword-v2-atom-codemeta-v2", + metadata="""<?xml version="1.0"?> + <atom:entry xmlns:atom="http://www.w3.org/2005/Atom" + xmlns="https://doi.org/10.5063/schema/codemeta-2.0"> + <name>My Software</name> + <author> + <name>Author 1</name> + <email>foo@example.org</email> + </author> + <author> + <name>Author 2</name> + </author> + </atom:entry> + """.encode(), + origin="https://example.org/jdoe/myrepo", +) + +GITHUB_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, @@ -74,7 +105,7 @@ REMD = RawExtrinsicMetadata( version="1.0.0", ), format="application/vnd.github.v3+json", - metadata=b'{"full_name": "test software"}', + metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) @@ -199,7 +230,7 @@ class TestMetadata: metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") - remd = attr.evolve(REMD, format="unknown format") + remd = attr.evolve(GITHUB_REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) @@ -221,7 +252,7 @@ class TestMetadata: assert tool is not None assert metadata_indexer.process_journal_objects( - {"raw_extrinsic_metadata": [REMD.to_dict()]} + {"raw_extrinsic_metadata": [GITHUB_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ @@ -237,22 +268,98 @@ class TestMetadata: tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, - from_remd_id=REMD.id, + from_remd_id=GITHUB_REMD.id, mappings=["github"], ) ] + def test_extrinsic_metadata_indexer_firstparty_deposit(self, mocker): + """Also nominal case, calling the mapping and storing the result""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + assert metadata_indexer.storage.method_calls == [ + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) + ] + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert results == [ + OriginExtrinsicMetadataRow( + id="https://example.org/jdoe/myrepo", + tool={"id": tool["id"], **TRANSLATOR_TOOL}, + metadata={ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "author": [ + {"email": "foo@example.org", "name": "Author 1"}, + {"name": "Author 2"}, + ], + "name": "My Software", + }, + from_remd_id=DEPOSIT_REMD.id, + mappings=["sword-codemeta"], + ) + ] + + def test_extrinsic_metadata_indexer_thirdparty_deposit(self, mocker): + """Metadata-only deposit: currently ignored""" + origin = "https://not-from-example.org/jdoe/myrepo" + + metadata_indexer = 
ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} + ) == {"status": "uneventful", "origin_extrinsic_metadata:add": 0} + + assert metadata_indexer.storage.method_calls == [ + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) + ] + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert results == [] + def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( - REMD, - authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY), + GITHUB_REMD, + authority=attr.evolve( + GITHUB_REMD.authority, type=MetadataAuthorityType.REGISTRY + ), ) results = metadata_indexer.index(remd.id, data=remd) @@ -275,9 +382,71 @@ class TestMetadata: ) assert tool is not None - results = metadata_indexer.index(REMD.id, data=REMD) + results = metadata_indexer.index(GITHUB_REMD.id, data=GITHUB_REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] + + def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): + """Two metadata objects with the same origin target""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + GITHUB_REMD.to_dict(), + {**GITHUB_REMD.to_dict(), "id": b"\x00" * 20}, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20 + + def test_extrinsic_directory_metadata_indexer_duplicate_origin(self, mocker): + """Two metadata objects on directories, but with an origin context""" + origin = DEPOSIT_REMD.origin + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + DEPOSIT_REMD.to_dict(), + { + **DEPOSIT_REMD.to_dict(), + "id": b"\x00" * 20, + "target": "swh:1:dir:" + "01" 
* 20, + }, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20 diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 999084bb4982eebdcea0daf7752223a580c3431a..e44ca71244ac28be435815c6ae85fddedda933e2 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from datetime import datetime, timezone +import itertools import pytest @@ -20,6 +21,13 @@ from swh.model.model import ( from swh.model.swhids import CoreSWHID from swh.storage.utils import now + +@pytest.fixture +def swh_storage_backend_config(): + """In-memory storage, to make tests go faster.""" + return {"cls": "memory"} + + SAMPLE_SNAPSHOT = Snapshot( branches={ b"foo": None, @@ -31,6 +39,28 @@ SAMPLE_SNAPSHOT = Snapshot( ) +def _add_snapshot_to_origin(storage, origin_url, visit_type, snapshot): + storage.origin_add([Origin(url=origin_url)]) + visit = storage.origin_visit_add( + [ + OriginVisit( + origin=origin_url, + date=datetime(2019, 2, 27, tzinfo=timezone.utc), + type="pypi", + ) + ] + )[0] + storage.snapshot_add([snapshot]) + visit_status = OriginVisitStatus( + origin=origin_url, + visit=visit.visit, + date=now(), + status="full", + snapshot=snapshot.id, + ) + storage.origin_visit_status_add([visit_status]) + + @pytest.fixture def storage(swh_storage): fill_storage(swh_storage) @@ -77,31 +107,115 @@ def test_vcs_missing_snapshot(storage): def test_pypi_missing_branch(storage): origin_url = "https://pypi.org/project/abcdef/" - storage.origin_add( - [ - Origin( - url=origin_url, - ) - ] + _add_snapshot_to_origin(storage, origin_url, "pypi", SAMPLE_SNAPSHOT) + assert get_head_swhid(storage, origin_url) is None + + +@pytest.mark.parametrize( + "branches_start,branches_middle,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot(storage, branches_start, branches_middle, branches_end): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(branches_middle)] + + [ + ( + b"refs/heads/foo", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) ) - visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=datetime(2019, 2, 27, tzinfo=timezone.utc), - type="pypi", - ) - ] - )[0] - storage.snapshot_add([SAMPLE_SNAPSHOT]) - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit.visit, - date=now(), - status="full", - snapshot=SAMPLE_SNAPSHOT.id, + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) - storage.origin_visit_status_add([visit_status]) + + +def test_large_snapshot_chained_aliases(storage): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(200)] 
+ + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/alias2" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/alias2", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/branch" + ), + ) + ] + + [(f"refs/heads/bbbb{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/branch", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + ) + + +@pytest.mark.parametrize( + "branches_start,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot_dangling_alias(storage, branches_start, branches_end): + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + assert get_head_swhid(storage, origin_url) is None diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 567f479adbc8bf2165326ee34d903c7239e8f414..4b7057e1796215525142e1141dc9f942169a79bb 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,7 @@ import copy from unittest.mock import patch +import attr import pytest from swh.indexer.metadata import OriginMetadataIndexer @@ -213,6 +214,58 @@ def test_origin_metadata_indexer_duplicate_directory( assert len(orig_results) == 2 +def test_origin_metadata_indexer_duplicate_directory_different_result( + swh_indexer_config, + idx_storage: IndexerStorageInterface, + storage: StorageInterface, + obj_storage, + mocker, +) -> None: + """Same as above, but indexing the same directory twice resulted in different + data (because list order differs). 
+ """ + indexer = OriginMetadataIndexer(config=swh_indexer_config) + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.catch_exceptions = False + origin1 = "https://github.com/librariesio/yarn-parser" + origin2 = "https://github.com/librariesio/yarn-parser.git" + + directory_index = indexer.directory_metadata_indexer.index + + nb_calls = 0 + + def side_effect(dir_id): + nonlocal nb_calls + if nb_calls == 0: + keywords = ["foo", "bar"] + elif nb_calls == 1: + keywords = ["bar", "foo"] + else: + assert False, nb_calls + nb_calls += 1 + return [ + attr.evolve(row, metadata={**row.metadata, "keywords": keywords}) + for row in directory_index(dir_id) + ] + + mocker.patch.object( + indexer.directory_metadata_indexer, "index", side_effect=side_effect + ) + + indexer.run([origin1, origin2]) + + dir_id = DIRECTORY2.id + + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert len(dir_results) == 1 + + orig_results = list( + indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) + ) + assert len(orig_results) == 2 + + def test_origin_metadata_indexer_no_metadata_file( swh_indexer_config, idx_storage: IndexerStorageInterface, diff --git a/tox.ini b/tox.ini index f0bda88c43a56d26850573d587eba2812e509376..b135fcc60c7362d419d26c5b2464ef89692ec11f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,4 +1,6 @@ [tox] +requires = + tox>4 envlist=black,flake8,mypy,py3 [testenv] @@ -20,15 +22,16 @@ commands = [testenv:black] skip_install = true deps = - black==22.3.0 + black==22.10.0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = - flake8==4.0.1 - flake8-bugbear==22.3.23 + flake8==5.0.4 + flake8-bugbear==22.9.23 + pycodestyle==2.9.1 commands = {envpython} -m flake8 @@ -36,7 +39,7 @@ commands = extras = testing deps = - mypy==0.942 + mypy==1.0 commands = mypy swh @@ -44,14 +47,12 @@ commands = # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] -whitelist_externals = make +allowlist_externals = make usedevelop = true extras = testing deps = - # fetch and install swh-docs in develop mode - -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs - + -e git+https://gitlab.softwareheritage.org/swh/devel/swh-docs.git\#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors @@ -59,18 +60,16 @@ setenv = commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs - # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] -whitelist_externals = make +allowlist_externals = make usedevelop = true extras = testing deps = # install swh-docs in develop mode -e ../swh-docs - setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors