diff --git a/conftest.py b/conftest.py index b4a6d0aeea533c93ed020dc2ac2deee5ed487075..a58bf2b3ad74063a08d44e2663716c7d1fbff5ba 100644 --- a/conftest.py +++ b/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -21,6 +21,5 @@ def swh_scheduler_celery_includes(swh_scheduler_celery_includes): "swh.loader.package.deposit.tasks", "swh.loader.package.npm.tasks", "swh.loader.package.pypi.tasks", - "swh.loader.package.nixguix.tasks", "swh.loader.package.maven.tasks", ] diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst index bb4c7015282fa8c81a06e8648e690b0ae913aad9..a9701725f3b4fa1b1855ec023f76732bfb82e26e 100644 --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -159,15 +159,6 @@ Here is an overview of the fields (+ internal version name + branch name) used b - "" - passed as arg - Only one artefact per url (jar/zip src) - * - nixguix - - URL - - URL - - URL - - None - - true - - "" - - None - - it's the URL of the artifact referenced by the derivation * - npm - ``metadata​["version"]`` - ``release_name(​version)`` diff --git a/pyproject.toml b/pyproject.toml index e075b51e08fdfef3bd1b903c2862f791f0a029df..df6981716eda481efa8fb80426fedf5ea0193daa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,6 @@ testing = {file = ["requirements-test.txt"]} "loader.golang" = "swh.loader.package.golang:register" "loader.hackage" = "swh.loader.package.hackage:register" "loader.hex" = "swh.loader.package.hex:register" -"loader.nixguix" = "swh.loader.package.nixguix:register" "loader.npm" = "swh.loader.package.npm:register" "loader.opam" = "swh.loader.package.opam:register" "loader.pubdev" = "swh.loader.package.pubdev:register" diff --git a/swh/loader/package/nixguix/README b/swh/loader/package/nixguix/README new file mode 100644 index 0000000000000000000000000000000000000000..510c92891433280187ae8a74fe98d91eb6e1169c --- /dev/null +++ b/swh/loader/package/nixguix/README @@ -0,0 +1,9 @@ +This loader no longer exists. 
+ +It has been removed in favor of the following loaders: +- swh.loader.core.loader.ContentLoader +- swh.loader.core.loader.TarballDirectoryLoader +- swh.loader.git.directory.GitCheckoutLoader +- swh.loader.mercurial.directory.HgCheckoutLoader +- swh.loader.svn.directory.SvnExportLoader + diff --git a/swh/loader/package/nixguix/__init__.py b/swh/loader/package/nixguix/__init__.py deleted file mode 100644 index a82f24b237b4a3f3dcd5e0a6a613fa04c8091fe8..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -from typing import Any, Mapping - - -def register() -> Mapping[str, Any]: - """Register the current worker module's definition""" - from .loader import NixGuixLoader - - return { - "task_modules": [f"{__name__}.tasks"], - "loader": NixGuixLoader, - } diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py deleted file mode 100644 index 46eeaf0f735bb8de8853af0380202c5debb2098b..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/loader.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (C) 2020-2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import copy -import json -import logging -import re -from typing import Any, Dict, Iterator, List, Mapping, Optional, Set, Tuple - -import attr - -from swh.loader.package.loader import ( - BasePackageInfo, - PackageLoader, - PartialExtID, - RawExtrinsicMetadataCore, -) -from swh.loader.package.utils import EMPTY_AUTHOR, cached_method, get_url_body -from swh.model import hashutil -from swh.model.model import ( - MetadataAuthority, - MetadataAuthorityType, - ObjectType, - Release, - Sha1Git, -) -from swh.model.swhids import CoreSWHID -from swh.storage.interface import StorageInterface - -logger = logging.getLogger(__name__) - -EXTID_TYPE = "subresource-integrity" -"""The ExtID is an ASCII string, as defined by -https://w3c.github.io/webappsec-subresource-integrity/""" - -EXTID_VERSION = 0 - - -@attr.s -class NixGuixPackageInfo(BasePackageInfo): - raw_info = attr.ib(type=Dict[str, Any]) - - integrity = attr.ib(type=str) - """Hash of the archive, formatted as in the Subresource Integrity - specification.""" - - @classmethod - def from_metadata( - cls, metadata: Dict[str, Any], version: str - ) -> "NixGuixPackageInfo": - return cls( - url=metadata["url"], - filename=None, - version=version, - integrity=metadata["integrity"], - raw_info=metadata, - ) - - def extid(self) -> PartialExtID: - return (EXTID_TYPE, EXTID_VERSION, self.integrity.encode("ascii")) - - -class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): - """Load sources from a sources.json file. This loader is used to load - sources used by functional package manager (eg. Nix and Guix).
- - """ - - visit_type = "nixguix" - - def __init__( - self, - storage: StorageInterface, - url: str, - unsupported_file_extensions: List[str] = [], - **kwargs: Any, - ): - super().__init__(storage=storage, url=url, **kwargs) - self.provider_url = url - self.unsupported_file_extensions = unsupported_file_extensions - - # Note: this could be renamed get_artifacts in the PackageLoader - # base class. - @cached_method - def raw_sources(self): - return retrieve_sources(self.origin.url) - - @cached_method - def supported_sources(self): - raw_sources = self.raw_sources() - return clean_sources( - parse_sources(raw_sources), self.unsupported_file_extensions - ) - - @cached_method - def integrity_by_url(self) -> Dict[str, str]: - sources = self.supported_sources() - return {s["urls"][0]: s["integrity"] for s in sources["sources"]} - - def get_versions(self) -> List[str]: - """The first mirror of the mirror list is used as branch name in the - snapshot. - - """ - return list(self.integrity_by_url().keys()) - - def get_metadata_authority(self): - return MetadataAuthority( - type=MetadataAuthorityType.FORGE, - url=self.origin.url, - metadata={}, - ) - - def get_extrinsic_snapshot_metadata(self): - return [ - RawExtrinsicMetadataCore( - format="nixguix-sources-json", - metadata=self.raw_sources(), - ), - ] - - # Note: this could be renamed get_artifact_info in the PackageLoader - # base class. - def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: - # TODO: try all mirrors and not only the first one. A source - # can be fetched from several urls, called mirrors. We - # currently only use the first one, but if the first one - # fails, we should try the second one and so on. - integrity = self.integrity_by_url()[url] - p_info = NixGuixPackageInfo.from_metadata( - {"url": url, "integrity": integrity}, version=url - ) - yield url, p_info - - def select_extid_target( - self, p_info: NixGuixPackageInfo, extid_targets: Set[CoreSWHID] - ) -> Optional[CoreSWHID]: - if extid_targets: - # The archive URL is part of the release name. As that URL is not - # intrinsic metadata, it means different releases may be created for - # the same SRI so they have the same extid. - # Therefore, we need to pick the one with the right URL. - releases = self.storage.release_get( - [target.object_id for target in extid_targets] - ) - extid_targets = { - release.swhid() - for release in releases - if release is not None and release.name == p_info.version.encode() - } - return super().select_extid_target(p_info, extid_targets) - - def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: - """We add a branch to the snapshot called 'evaluation' pointing to the - revision used to generate the sources.json file. This revision - is specified in the sources.json file itself. For the nixpkgs - origin, this revision is coming from the - github.com/nixos/nixpkgs repository. - - Note this repository is not loaded explicitly. So, this - pointer can target a nonexistent revision for a time. However, - the github and gnu loaders are supposed to load this revision - and should create the revision pointed by this branch. - - This branch can be used to identify the snapshot associated to - a Nix/Guix evaluation. - - """ - # The revision used to create the sources.json file. 
For Nix, - # this revision belongs to the github.com/nixos/nixpkgs - # repository - revision = self.supported_sources()["revision"] - return { - b"evaluation": { - "target_type": "revision", - "target": hashutil.hash_to_bytes(revision), - } - } - - def build_release( - self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git - ) -> Optional[Release]: - return Release( - name=p_info.version.encode(), - message=None, - author=EMPTY_AUTHOR, - date=None, - target=directory, - target_type=ObjectType.DIRECTORY, - synthetic=True, - ) - - -def retrieve_sources(url: str) -> bytes: - """Retrieve sources. Potentially raise NotFound error.""" - return get_url_body(url, allow_redirects=True) - - -def parse_sources(raw_sources: bytes) -> Dict[str, Any]: - return json.loads(raw_sources.decode("utf-8")) - - -def make_pattern_unsupported_file_extension( - unsupported_file_extensions: List[str], -): - """Make a regexp pattern for unsupported file extension out of a list - of unsupported archive extension list. - - """ - return re.compile( - rf".*\.({'|'.join(map(re.escape, unsupported_file_extensions))})$", re.DOTALL - ) - - -def clean_sources( - sources: Dict[str, Any], unsupported_file_extensions=[] -) -> Dict[str, Any]: - """Validate and clean the sources structure. First, ensure all top level keys are - present. Then, walk the sources list and remove sources that do not contain required - keys. - - Filter out source entries whose: - - required keys are missing - - source type is not supported - - urls attribute type is not a list - - extension is known not to be supported by the loader - - Raises: - ValueError if: - - a required top level key is missing - - top-level version is not 1 - - Returns: - source Dict cleaned up - - """ - pattern_unsupported_file = make_pattern_unsupported_file_extension( - unsupported_file_extensions - ) - # Required top level keys - required_keys = ["version", "revision", "sources"] - missing_keys = [] - for required_key in required_keys: - if required_key not in sources: - missing_keys.append(required_key) - - if missing_keys != []: - raise ValueError( - f"sources structure invalid, missing: {','.join(missing_keys)}" - ) - - # Only the version 1 is currently supported - version = int(sources["version"]) - if version != 1: - raise ValueError( - f"The sources structure version '{sources['version']}' is not supported" - ) - - # If a source doesn't contain required attributes, this source is - # skipped but others could still be archived. 
- verified_sources = [] - for source in sources["sources"]: - valid = True - required_keys = ["urls", "integrity", "type"] - for required_key in required_keys: - if required_key not in source: - logger.info( - f"Skip source '{source}' because key '{required_key}' is missing", - ) - valid = False - - if valid and source["type"] != "url": - logger.info( - f"Skip source '{source}' because the type {source['type']} " - "is not supported", - ) - valid = False - - if valid and not isinstance(source["urls"], list): - logger.info( - f"Skip source {source} because the urls attribute is not a list" - ) - valid = False - - if valid and len(source["urls"]) > 0: # Filter out unsupported archives - supported_sources: List[str] = [] - for source_url in source["urls"]: - if pattern_unsupported_file.match(source_url): - logger.info(f"Skip unsupported artifact url {source_url}") - continue - supported_sources.append(source_url) - - if len(supported_sources) == 0: - logger.info( - f"Skip source {source} because urls only reference " - "unsupported artifacts. Unsupported " - f"artifacts so far: {pattern_unsupported_file}" - ) - continue - - new_source = copy.deepcopy(source) - new_source["urls"] = supported_sources - verified_sources.append(new_source) - - sources["sources"] = verified_sources - return sources diff --git a/swh/loader/package/nixguix/tasks.py b/swh/loader/package/nixguix/tasks.py deleted file mode 100644 index 44abd639471570d53ac708bf09b36553b4712524..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tasks.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (C) 2020-2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from celery import shared_task - -from swh.loader.package.nixguix.loader import NixGuixLoader - - -@shared_task(name=__name__ + ".LoadNixguix") -def load_nixguix(**kwargs): - """Load functional (e.g. 
guix/nix) package""" - return NixGuixLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/nixguix/tests/__init__.py b/swh/loader/package/nixguix/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/swh/loader/package/nixguix/tests/conftest.py b/swh/loader/package/nixguix/tests/conftest.py deleted file mode 100644 index 46c5be4528bc086c965843d9afae4ca31eb9537b..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/conftest.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from typing import Any, Dict - -import pytest - - -@pytest.fixture -def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: - # nixguix loader needs a pg-storage backend because some tests share data - return { - "storage": swh_storage_backend_config, - "unsupported_file_extensions": [ - "patch", - "iso", - "whl", - "gem", - "pom", - "msi", - "pod", - "png", - "rock", - "ttf", - "jar", - "c", - "el", - "rpm", - "diff", - ], - } diff --git a/swh/loader/package/nixguix/tests/data/https_example.com/file.txt b/swh/loader/package/nixguix/tests/data/https_example.com/file.txt deleted file mode 100644 index d95f3ad14dee633a758d2e331151e950dd13e4ed..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_example.com/file.txt +++ /dev/null @@ -1 +0,0 @@ -content diff --git a/swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz b/swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz deleted file mode 100644 index 958841ca0e374281722f9431ed36d4ef0bee6d1a..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz deleted file mode 100644 index 2848fb9110c1c2b10ce0990466243265ee50afaf..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 deleted file mode 120000 index d1bbde9d31ef080bb50e11269ff0af34c05a966e..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 +++ /dev/null @@ -1 +0,0 @@ -gnu_8sync_8sync-0.1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 deleted file mode 120000 index d1bbde9d31ef080bb50e11269ff0af34c05a966e..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 +++ /dev/null @@ -1 +0,0 @@ -gnu_8sync_8sync-0.1.0.tar.gz \ No newline at end of file diff --git 
a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz deleted file mode 100644 index ad9cbfac0923f612f0923cc178ad363837da8d73..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz b/swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz deleted file mode 100644 index 0ead277251175e7163afd84620528f28119d8f16..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz b/swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz deleted file mode 100644 index 8b47ea34b35a36326bf73604d3468ca628ce6ceb..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz b/swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz deleted file mode 100644 index 3bb121207f786a7b6b3e7f38841adc412045327f..0000000000000000000000000000000000000000 Binary files a/swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz and /dev/null differ diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json deleted file mode 100644 index 125bc60583f9ef561ee1ec08c45a5c2fd2144945..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "sources": [ - { - "type": "url", - "urls": [ "https://fail.com/truncated-archive.tgz" ], - "integrity": "sha256-UB+RzIn63O0WxzqohYeWZRRzYCxyK7Kfhqi6WI0P8bE=" - } - ], - "version": 1, - "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" -} diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json deleted file mode 100644 index aaa6d1c38d12d3584bd73ea6fcc16f08fc0f9500..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "sources": [ - { - "type": "url", - "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], - "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - }, - { - "type": "url", - "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ], - "integrity": "sha256-sovQhmpumj0DpISN2c1QCLY6esueeV81zR/1CeUC13Q=" - }, - { - "type": "url", - "urls": [ "https://example.com/file.txt" ], - "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" - } - ], - "version": "1", - "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" -} diff --git 
a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 deleted file mode 100644 index a658c2568f8cdd439d2f3d668cb396b8e05a28e0..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 +++ /dev/null @@ -1,21 +0,0 @@ -{ - "sources": [ - { - "type": "url", - "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], - "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - }, - { - "type": "url", - "urls": [ "https://github.com/owner-2/repository-1/revision-1.tgz" ], - "integrity": "sha256-+vRlzTcnhMlynJGGMuAgMnUGdjpSqGabhcQ/SlRplAE=" - }, - { - "type": "url", - "urls": [ "https://example.com/file.txt" ], - "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" - } - ], - "version": 1, - "revision": "602140776b2ce6c9159bcf52ada73a297c063d5e" -} diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json deleted file mode 100644 index a4871b4d8a4e4d897dde824e0411cecb811a5cbb..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "sources": [ - { - "type": "url", - "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ], - "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - }, - { - "type": "url", - "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ], - "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - } - ], - "version": 1, - "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" -} diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 deleted file mode 100644 index 54cc93b3ea4a2bea4f51a31fec7431c53cc5cf61..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 +++ /dev/null @@ -1,16 +0,0 @@ -{ - "sources": [ - { - "type": "url", - "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ], - "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - }, - { - "type": "url", - "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz" ], - "integrity": "sha256-4wn2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs=" - } - ], - "version": 1, - "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7" -} diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py deleted file mode 100644 index 334f62bfe49f8d6ecededb50fd3df0200b630bbe..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ /dev/null @@ -1,655 +0,0 @@ -# Copyright (C) 2020-2023 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import json -import logging -import os -from typing import Dict, Optional, Tuple - -import pytest - -from swh.loader.package import __version__ -from 
swh.loader.package.archive.loader import ArchiveLoader -from swh.loader.package.nixguix.loader import ( - NixGuixLoader, - clean_sources, - make_pattern_unsupported_file_extension, - parse_sources, - retrieve_sources, -) -from swh.loader.package.utils import download -from swh.loader.tests import assert_last_visit_matches -from swh.loader.tests import check_snapshot as check_snapshot_full -from swh.loader.tests import get_stats -from swh.model.hashutil import hash_to_bytes -from swh.model.model import ( - MetadataAuthority, - MetadataAuthorityType, - MetadataFetcher, - ObjectType, - Person, - RawExtrinsicMetadata, - Release, - Snapshot, - SnapshotBranch, - TargetType, -) -from swh.model.swhids import ExtendedObjectType, ExtendedSWHID -from swh.storage.algos.origin import origin_get_latest_visit_status -from swh.storage.algos.snapshot import snapshot_get_all_branches -from swh.storage.exc import HashCollision -from swh.storage.interface import PagedResult, StorageInterface - -sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json" - - -@pytest.fixture -def raw_sources(datadir) -> bytes: - with open( - os.path.join( - datadir, "https_nix-community.github.io", "nixpkgs-swh_sources.json" - ), - "rb", - ) as f: - return f.read() - - -SNAPSHOT1 = Snapshot( - branches={ - b"evaluation": SnapshotBranch( - target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), - target_type=TargetType.REVISION, - ), - b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"), - target_type=TargetType.RELEASE, - ), - b"https://github.com/owner-3/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("adbc07c7d5654aa9ecf7b4fd8ec79083477f5b51"), - target_type=TargetType.RELEASE, - ), - }, -) - - -def check_snapshot(snapshot: Snapshot, storage: StorageInterface): - # The `evaluation` branch is allowed to be unresolvable. It's possible at current - # nixguix visit time, it is not yet visited (the git loader is in charge of its - # visit for now). For more details, check the - # swh.loader.package.nixguix.NixGuixLoader.extra_branches docstring. - check_snapshot_full( - snapshot, storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] - ) - - assert isinstance(snapshot, Snapshot) - # then ensure the snapshot revisions are structurally as expected - revision_ids = [] - for name, branch in snapshot.branches.items(): - if name == b"evaluation": - continue # skipping that particular branch (cf. previous comment) - if branch.target_type == TargetType.REVISION: - revision_ids.append(branch.target) - - revisions = storage.revision_get(revision_ids) - for rev in revisions: - assert rev is not None - metadata = rev.metadata - assert not metadata - - -def test_retrieve_sources(swh_storage, requests_mock_datadir): - j = parse_sources(retrieve_sources(sources_url)) - assert "sources" in j.keys() - assert len(j["sources"]) == 3 - - -def test_nixguix_url_not_found(swh_storage, requests_mock_datadir): - """When failing to read from the url, the visit is marked as not_found. - - Here the sources url does not exist, so requests_mock_datadir returns a 404. - Resulting in a NotFound raised within the package loader's main loop. - - This results in the task with status failed and a visit_status with status - "not_found". 
- - """ - unknown_url = "https://non-existing-url/" - loader = NixGuixLoader(swh_storage, unknown_url) - # during the retrieval step - load_status = loader.load() - - assert load_status == {"status": "failed"} - - assert_last_visit_matches( - swh_storage, unknown_url, status="not_found", type="nixguix", snapshot=None - ) - - assert len(requests_mock_datadir.request_history) == 1 - assert requests_mock_datadir.request_history[0].url == unknown_url - - -def test_nixguix_url_with_decoding_error(swh_storage, requests_mock_datadir): - """Other errors during communication with the url, the visit is marked as failed - - requests_mock_datadir will intercept the requests to sources_url. Since the file - exists, returns a 200 with the requested content of the query. As file.txt is no - json, fails do decode and raises a JSONDecodeError. In effect failing the visit. - - """ - sources_url = "https://example.com/file.txt" - loader = NixGuixLoader(swh_storage, sources_url) - load_status = loader.load() - - assert load_status == {"status": "failed"} - - assert_last_visit_matches( - swh_storage, sources_url, status="failed", type="nixguix", snapshot=None - ) - - assert len(requests_mock_datadir.request_history) == 1 - assert requests_mock_datadir.request_history[0].url == sources_url - - -def test_clean_sources_invalid_schema(swh_storage, requests_mock_datadir): - sources = {} - with pytest.raises(ValueError, match="sources structure invalid, missing: .*"): - clean_sources(sources) - - -def test_clean_sources_invalid_version(swh_storage, requests_mock_datadir): - for version_ok in [1, "1"]: # Check those versions are fine - clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"}) - - for version_ko in [0, "0", 2, "2"]: # Check version != 1 raise an error - with pytest.raises( - ValueError, match="sources structure version .* is not supported" - ): - clean_sources( - {"version": version_ko, "sources": [], "revision": "my-revision"} - ) - - -def test_clean_sources_invalid_sources(swh_storage, requests_mock_datadir): - valid_sources = [ - # 1 valid source - {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"}, - ] - sources = { - "version": 1, - "sources": valid_sources - + [ - # integrity is missing - { - "type": "url", - "urls": ["my-url.tgz"], - }, - # urls is not a list - {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"}, - # type is not url - {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"}, - # missing fields which got double-checked nonetheless... 
- {"integrity": "my-integrity"}, - ], - "revision": "my-revision", - } - clean = clean_sources(sources) - - assert len(clean["sources"]) == len(valid_sources) - - -def test_make_pattern_unsupported_file_extension(): - unsupported_extensions = ["el", "c", "txt"] - supported_extensions = ["Z", "7z"] # for test - - actual_unsupported_pattern = make_pattern_unsupported_file_extension( - unsupported_extensions - ) - - for supported_ext in supported_extensions: - assert supported_ext not in unsupported_extensions - - supported_filepath = f"anything.{supported_ext}" - actual_match = actual_unsupported_pattern.match(supported_filepath) - assert not actual_match - - for unsupported_ext in unsupported_extensions: - unsupported_filepath = f"something.{unsupported_ext}" - actual_match = actual_unsupported_pattern.match(unsupported_filepath) - assert actual_match - - -def test_clean_sources_unsupported_artifacts(swh_storage, requests_mock_datadir): - unsupported_file_extensions = [ - "iso", - "whl", - "gem", - "pom", - "msi", - "pod", - "png", - "rock", - "ttf", - "jar", - "c", - "el", - "rpm", - "diff", - "patch", - ] - supported_sources = [ - { - "type": "url", - "urls": [f"https://server.org/my-url.{ext}"], - "integrity": "my-integrity", - } - for ext in [ - "known-unknown-but-ok", # this is fine as well with the current approach - "zip", - "tar.gz", - "tgz", - "tar.bz2", - "tbz", - "tbz2", - "tar.xz", - "tar", - "zip", - "7z", - "Z", - ] - ] - - unsupported_sources = [ - { - "type": "url", - "urls": [f"https://server.org/my-url.{ext}"], - "integrity": "my-integrity", - } - for ext in unsupported_file_extensions - ] - - sources = { - "version": 1, - "sources": supported_sources + unsupported_sources, - "revision": "my-revision", - } - - clean = clean_sources(sources, unsupported_file_extensions) - - assert len(clean["sources"]) == len(supported_sources) - - -def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources): - loader = NixGuixLoader(swh_storage, sources_url) - load_status = loader.load() - expected_snapshot_id = SNAPSHOT1.id - expected_snapshot_id_hex = expected_snapshot_id.hex() - assert load_status == { - "status": "eventful", - "snapshot_id": expected_snapshot_id_hex, - } - - release_id = SNAPSHOT1.branches[ - b"https://github.com/owner-1/repository-1/revision-1.tgz" - ].target - check_snapshot(SNAPSHOT1, storage=swh_storage) - - assert swh_storage.release_get([release_id])[0] == Release( - id=release_id, - name=b"https://github.com/owner-1/repository-1/revision-1.tgz", - message=None, - target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"), - target_type=ObjectType.DIRECTORY, - synthetic=True, - author=Person.from_fullname(b""), - date=None, - ) - - stats = get_stats(swh_storage) - assert { - "content": 2, - "directory": 5, - "origin": 1, - "origin_visit": 1, - "release": 2, - "revision": 0, - "skipped_content": 0, - "snapshot": 1, - } == stats - - # The visit is partial because urls pointing to non tarball file - # are not handled yet - assert_last_visit_matches( - swh_storage, sources_url, status="partial", type="nixguix" - ) - - visit_status = origin_get_latest_visit_status(swh_storage, sources_url) - snapshot_swhid = ExtendedSWHID( - object_type=ExtendedObjectType.SNAPSHOT, object_id=visit_status.snapshot - ) - metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.FORGE, - url=sources_url, - ) - expected_metadata = [ - RawExtrinsicMetadata( - target=snapshot_swhid, - authority=metadata_authority, - fetcher=MetadataFetcher( - 
name="swh.loader.package.nixguix.loader.NixGuixLoader", - version=__version__, - ), - discovery_date=loader.visit_date, - format="nixguix-sources-json", - metadata=raw_sources, - origin=sources_url, - ) - ] - assert swh_storage.raw_extrinsic_metadata_get( - snapshot_swhid, - metadata_authority, - ) == PagedResult( - next_page_token=None, - results=expected_metadata, - ) - - -def test_uncompress_failure(swh_storage, requests_mock_datadir): - """Non tarball files are currently not supported and the uncompress - function fails on such kind of files. - - However, even in this case of failure (because of the url - https://example.com/file.txt), a snapshot and a visit has to be - created (with a status partial since all files are not archived). - - """ - loader = NixGuixLoader(swh_storage, sources_url) - loader_status = loader.load() - - sources = loader.supported_sources()["sources"] - urls = [s["urls"][0] for s in sources] - assert "https://example.com/file.txt" in urls - assert loader_status["status"] == "eventful" - - # The visit is partial because urls pointing to non tarball files - # are not handled yet - assert_last_visit_matches( - swh_storage, sources_url, status="partial", type="nixguix" - ) - - -def test_loader_incremental(swh_storage, requests_mock_datadir): - """Ensure a second visit do not download artifact already - downloaded by the previous visit. - - """ - loader = NixGuixLoader(swh_storage, sources_url) - load_status = loader.load() - - loader.load() - assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} - - assert_last_visit_matches( - swh_storage, - sources_url, - status="partial", - type="nixguix", - snapshot=SNAPSHOT1.id, - ) - - check_snapshot(SNAPSHOT1, storage=swh_storage) - - urls = [ - m.url - for m in requests_mock_datadir.request_history - if m.url == ("https://github.com/owner-1/repository-1/revision-1.tgz") - ] - # The artifact - # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only - # visited one time - assert len(urls) == 1 - - -def test_loader_two_visits(swh_storage, requests_mock_datadir_visits): - """To ensure there is only one origin, but two visits, two revisions - and two snapshots are created. - - The first visit creates a snapshot containing one tarball. The - second visit creates a snapshot containing the same tarball and - another tarball. - - """ - loader = NixGuixLoader(swh_storage, sources_url) - load_status = loader.load() - assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} - - assert_last_visit_matches( - swh_storage, - sources_url, - status="partial", - type="nixguix", - snapshot=SNAPSHOT1.id, - ) - - check_snapshot(SNAPSHOT1, storage=swh_storage) - - stats = get_stats(swh_storage) - assert { - "content": 2, - "directory": 5, - "origin": 1, - "origin_visit": 1, - "release": 2, - "revision": 0, - "skipped_content": 0, - "snapshot": 1, - } == stats - - loader = NixGuixLoader(swh_storage, sources_url) - load_status = loader.load() - - expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a" - expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) - assert load_status == { - "status": "eventful", - "snapshot_id": expected_snapshot_id_hex, - } - - assert_last_visit_matches( - swh_storage, - sources_url, - status="partial", - type="nixguix", - snapshot=expected_snapshot_id, - ) - - # This ensures visits are incremental. Indeed, if we request a - # second time an url, because of the requests_mock_datadir_visits - # fixture, the file has to end with `_visit1`. 
- expected_snapshot = Snapshot( - id=expected_snapshot_id, - branches={ - b"evaluation": SnapshotBranch( - target=hash_to_bytes("602140776b2ce6c9159bcf52ada73a297c063d5e"), - target_type=TargetType.REVISION, - ), - b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"), - target_type=TargetType.RELEASE, - ), - b"https://github.com/owner-2/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"), - target_type=TargetType.RELEASE, - ), - }, - ) - check_snapshot(expected_snapshot, storage=swh_storage) - - stats = get_stats(swh_storage) - assert { - "content": 3, - "directory": 7, - "origin": 1, - "origin_visit": 2, - "release": 3, - "revision": 0, - "skipped_content": 0, - "snapshot": 2, - } == stats - - -def test_evaluation_branch(swh_storage, requests_mock_datadir): - loader = NixGuixLoader(swh_storage, sources_url) - res = loader.load() - assert res["status"] == "eventful" - - assert_last_visit_matches( - swh_storage, - sources_url, - status="partial", - type="nixguix", - snapshot=SNAPSHOT1.id, - ) - - check_snapshot(SNAPSHOT1, storage=swh_storage) - - -def test_eoferror(swh_storage, requests_mock_datadir): - """Load a truncated archive which is invalid to make the uncompress - function raising the exception EOFError. We then check if a - snapshot is created, meaning this error is well managed. - - """ - sources = ( - "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa - ) - loader = NixGuixLoader(swh_storage, sources) - loader.load() - - expected_snapshot = Snapshot( - id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"), - branches={ - b"evaluation": SnapshotBranch( - target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), - target_type=TargetType.REVISION, - ), - }, - ) - - check_snapshot(expected_snapshot, storage=swh_storage) - - -def fake_download( - url: str, - dest: str, - hashes: Dict = {}, - filename: Optional[str] = None, - auth: Optional[Tuple[str, str]] = None, -) -> Tuple[str, Dict]: - """Fake download which raises HashCollision (for the sake of test simpliciy, - let's accept that makes sense) - - For tests purpose only. - - """ - if url == "https://example.com/file.txt": - # instead of failing because it's a file not dealt with by the nix guix - # loader, make it raise a hash collision - raise HashCollision("sha1", "f92d74e3874587aaf443d1db961d4e26dde13e9c", []) - return download(url, dest, hashes, filename, auth) - - -def test_raise_exception(swh_storage, requests_mock_datadir, mocker): - mock_download = mocker.patch("swh.loader.package.loader.download") - mock_download.side_effect = fake_download - - loader = NixGuixLoader(swh_storage, sources_url) - res = loader.load() - - assert res == { - "status": "eventful", - "snapshot_id": SNAPSHOT1.id.hex(), - } - - # The visit is partial because some artifact downloads failed - assert_last_visit_matches( - swh_storage, - sources_url, - status="partial", - type="nixguix", - snapshot=SNAPSHOT1.id, - ) - - check_snapshot(SNAPSHOT1, storage=swh_storage) - - assert len(mock_download.mock_calls) == 3 - - -def test_load_nixguix_one_common_artifact_from_other_loader( - swh_storage, datadir, requests_mock_datadir_visits, caplog -): - """Misformatted revision should be caught and logged, then loading continues""" - caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader") - - # 1. 
first ingest with for example the archive loader - gnu_url = "https://ftp.gnu.org/gnu/8sync/" - release = "0.1.0" - artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz" - gnu_artifacts = [ - { - "time": 944729610, - "url": artifact_url, - "length": 221837, - "filename": f"8sync-{release}.tar.gz", - "version": release, - } - ] - archive_loader = ArchiveLoader(swh_storage, url=gnu_url, artifacts=gnu_artifacts) - actual_load_status = archive_loader.load() - expected_snapshot_id = "9efecc835e8f99254934f256b5301b94f348fd17" - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] == expected_snapshot_id # noqa - - assert_last_visit_matches( - archive_loader.storage, - gnu_url, - status="full", - type="tar", - snapshot=hash_to_bytes(expected_snapshot_id), - ) - - # 2. Then ingest with the nixguix loader which lists the same artifact within its - # sources.json - - # ensure test setup is ok - data_sources = os.path.join( - datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json" - ) - all_sources = json.loads(open(data_sources).read()) - found = False - for source in all_sources["sources"]: - if source["urls"][0] == artifact_url: - found = True - assert ( - found is True - ), f"test setup error: {artifact_url} must be in {data_sources}" - - # first visit with a snapshot, ok - sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json" - loader = NixGuixLoader(swh_storage, sources_url) - actual_load_status2 = loader.load() - assert actual_load_status2["status"] == "eventful" - - snapshot_id = actual_load_status2["snapshot_id"] - - assert_last_visit_matches( - swh_storage, - sources_url, - status="full", - type="nixguix", - snapshot=hash_to_bytes(snapshot_id), - ) - - snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id)) - assert snapshot diff --git a/swh/loader/package/nixguix/tests/test_tasks.py b/swh/loader/package/nixguix/tests/test_tasks.py deleted file mode 100644 index 63db9db6dc828f899dadf05d4e168b321bef7237..0000000000000000000000000000000000000000 --- a/swh/loader/package/nixguix/tests/test_tasks.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (C) 2020-2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import uuid - -import pytest - -from swh.scheduler.model import ListedOrigin, Lister - -NAMESPACE = "swh.loader.package.nixguix" - - -@pytest.fixture -def nixguix_lister(): - return Lister(name="nixguix", instance_name="example", id=uuid.uuid4()) - - -@pytest.fixture -def nixguix_listed_origin(nixguix_lister): - return ListedOrigin( - lister_id=nixguix_lister.id, - url="https://nixguix.example.org/", - visit_type="nixguix", - ) - - -def test_nixguix_loader_task_for_listed_origin( - loading_task_creation_for_listed_origin_test, - nixguix_lister, - nixguix_listed_origin, -): - loading_task_creation_for_listed_origin_test( - loader_class_name=f"{NAMESPACE}.loader.NixGuixLoader", - task_function_name=f"{NAMESPACE}.tasks.LoadNixguix", - lister=nixguix_lister, - listed_origin=nixguix_listed_origin, - )
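
The README added by this patch only names the replacement loaders; for orientation, here is a minimal sketch of how a single tarball artifact, formerly reached through a nixguix sources.json entry, could now be ingested with one of them. It assumes TarballDirectoryLoader follows the usual swh loader conventions (storage and origin URL passed to the constructor, a load() method returning a status dict) and accepts a per-origin checksums mapping; the keyword arguments, the example URL and the dummy sha256 value are illustrative assumptions, not taken from this patch.

# Sketch only: assumes TarballDirectoryLoader keeps the common swh loader shape
# (storage + origin URL in the constructor, a load() entry point) and takes a
# "checksums" mapping; those argument names are assumptions.
from swh.loader.core.loader import TarballDirectoryLoader
from swh.storage import get_storage

storage = get_storage(cls="memory")  # any configured swh storage backend

loader = TarballDirectoryLoader(
    storage,
    "https://example.org/artifact-1.0.tar.gz",  # hypothetical tarball URL
    checksums={"sha256": 64 * "0"},  # assumed keyword: integrity is now supplied per origin
)
result = loader.load()  # returns a {"status": ...} dict like other swh loaders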