From 0f7a1cbecaecdfae124c749e76a78025b4088260 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 6 Jul 2022 11:37:06 +0200
Subject: [PATCH] model: Add Directory.from_possibly_duplicated_entries factory

It will be used by swh.storage.backfiller (and so, indirectly, by
swh.scrubber) to load directories from the postgresql database -- whose
schema accidentally allowed directories with duplicate entries -- without
corrupting the shape of the directory too much.
---
 swh/model/model.py            |  93 ++++++++++++++++++++++-
 swh/model/tests/test_model.py | 137 ++++++++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+), 4 deletions(-)

diff --git a/swh/model/model.py b/swh/model/model.py
index 508d41cb..1073cc61 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -16,10 +16,11 @@ method to convert between them and msgpack-serializable objects.
 """
 
 from abc import ABCMeta, abstractmethod
+import collections
 import datetime
 from enum import Enum
 import hashlib
-from typing import Any, Dict, Iterable, Optional, Tuple, Type, TypeVar, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
 
 import attr
 from attrs_strict import AttributeTypeError
@@ -29,7 +30,7 @@ from typing_extensions import Final
 
 from . import git_objects
 from .collections import ImmutableDict
-from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex
+from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex
 from .swhids import CoreSWHID
 from .swhids import ExtendedObjectType as SwhidExtendedObjectType
 from .swhids import ExtendedSWHID
@@ -266,7 +267,7 @@ class HashableObjectWithManifest(HashableObject):
         attribute is set to an empty value.
         """
         if self.raw_manifest is None:
-            return super().compute_hash()
+            return super().compute_hash()  # calls self._compute_hash_from_attributes()
         else:
             return _compute_hash_from_manifest(self.raw_manifest)
 
@@ -943,12 +944,15 @@ class Revision(HashableObjectWithManifest, BaseModel):
     )
 
 
+_DIR_ENTRY_TYPES = ["file", "dir", "rev"]
+
+
 @attr.s(frozen=True, slots=True)
 class DirectoryEntry(BaseModel):
     object_type: Final = "directory_entry"
 
     name = attr.ib(type=bytes, validator=type_validator())
-    type = attr.ib(type=str, validator=attr.validators.in_(["file", "dir", "rev"]))
+    type = attr.ib(type=str, validator=attr.validators.in_(_DIR_ENTRY_TYPES))
     target = attr.ib(type=Sha1Git, validator=type_validator(), repr=hash_repr)
     perms = attr.ib(type=int, validator=type_validator(), converter=int, repr=oct)
     """Usually one of the values of `swh.model.from_disk.DentryPerms`."""
@@ -996,6 +1000,87 @@ class Directory(HashableObjectWithManifest, BaseModel):
         """Returns a SWHID representing this object."""
         return CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=self.id)
 
+    @classmethod
+    def from_possibly_duplicated_entries(
+        cls,
+        *,
+        entries: Tuple[DirectoryEntry, ...],
+        id: Sha1Git = b"",
+        raw_manifest: Optional[bytes] = None,
+    ) -> Tuple[bool, "Directory"]:
+        """Constructs a ``Directory`` object from a list of entries that may
+        contain duplicated names.
+
+        This is required to represent legacy objects that were ingested into
+        the storage database before this check was added.
+
+        As it is impossible for a ``Directory`` instance to have more than one
+        entry with a given name, this function computes a ``raw_manifest`` and
+        renames duplicated entries before constructing the ``Directory``.
+
+        Returns:
+            ``(is_corrupt, directory)`` where ``is_corrupt`` is True iff some
+            entry names were indeed duplicated
+        """
+        # First, try building a Directory object normally, without any extra
+        # computation, which works the overwhelming majority of the time:
+        try:
+            return (False, Directory(entries=entries, id=id, raw_manifest=raw_manifest))
+        except ValueError:
+            pass
+
+        # If it fails:
+        # 1. compute a raw_manifest if there isn't already one:
+        if raw_manifest is None:
+            # invalid_directory behaves like a Directory object, but without the
+            # duplicated entry check, which allows computing its raw_manifest
+            invalid_directory = type("", (), {})()
+            invalid_directory.entries = entries
+            raw_manifest = git_objects.directory_git_object(invalid_directory)
+
+        # 2. look for duplicated entries:
+        entries_by_name: Dict[
+            bytes, Dict[str, List[DirectoryEntry]]
+        ] = collections.defaultdict(lambda: collections.defaultdict(list))
+        for entry in entries:
+            entries_by_name[entry.name][entry.type].append(entry)
+
+        # 3. strip duplicates
+        deduplicated_entries = []
+        for entry_lists in entries_by_name.values():
+            # We could pick one entry at random to keep its original name; but
+            # we try to "minimize" the impact, by preserving entries of type
+            # "rev" first (because renaming them would likely break git
+            # submodules entirely when this directory is written to disk),
+            # then entries of type "dir" (because renaming them affects the
+            # path of every file in the dir, instead of just one content
+            # ("cnt")).
+            dir_entry_types = ("rev", "dir", "file")
+            assert set(dir_entry_types) == set(_DIR_ENTRY_TYPES)
+            picked_winner = False  # when True, all future entries must be renamed
+            for type_ in dir_entry_types:
+                for entry in entry_lists[type_]:
+                    if not picked_winner:
+                        # this is the "most important" entry according to this
+                        # heuristic; it gets to keep its name.
+                        deduplicated_entries.append(entry)
+                        picked_winner = True
+                    else:
+                        # the heuristic already found an entry more important
+                        # than this one, so this one must be renamed; we use
+                        # the beginning of its hash, which should be good
+                        # enough to avoid any conflict.
+                        new_name = (
+                            entry.name + b"_" + hash_to_bytehex(entry.target)[0:10]
+                        )
+                        renamed_entry = attr.evolve(entry, name=new_name)
+                        deduplicated_entries.append(renamed_entry)
+
+        # Finally, return the "fixed" directory
+        dir_ = Directory(
+            entries=tuple(deduplicated_entries), id=id, raw_manifest=raw_manifest
+        )
+        return (True, dir_)
+
 
 @attr.s(frozen=True, slots=True)
 class BaseContent(BaseModel):
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index 590e4b4a..0172386a 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -943,6 +943,143 @@ def test_directory_duplicate_entry_name():
         Directory(entries=entries)
 
 
+@given(strategies.directories())
+def test_directory_from_possibly_duplicated_entries__no_duplicates(directory):
+    """
+    Directory.from_possibly_duplicated_entries should return the directory
+    unchanged if it has no duplicated entry name.
+    """
+    assert (False, directory) == Directory.from_possibly_duplicated_entries(
+        id=directory.id, entries=directory.entries, raw_manifest=directory.raw_manifest
+    )
+    assert (False, directory) == Directory.from_possibly_duplicated_entries(
+        entries=directory.entries, raw_manifest=directory.raw_manifest
+    )
+
+
+@pytest.mark.parametrize("rev_first", [True, False])
+def test_directory_from_possibly_duplicated_entries__rev_and_dir(rev_first):
+    entries = (
+        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+    )
+    if rev_first:
+        entries = tuple(reversed(entries))
+    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+    assert is_corrupt
+    assert dir_.entries == (
+        DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+        DirectoryEntry(
+            name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
+        ),
+    )
+
+    # order is independent of 'rev_first' because it is always sorted in git order
+    assert dir_.raw_manifest == (
+        # fmt: off
+        b"tree 52\x00"
+        + b"0 foo\x00" + b"\x00" * 20
+        + b"1 foo\x00" + b"\x01" * 20
+        # fmt: on
+    )
+
+
+@pytest.mark.parametrize("file_first", [True, False])
+def test_directory_from_possibly_duplicated_entries__file_and_dir(file_first):
+    entries = (
+        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+    )
+    if file_first:
+        entries = tuple(reversed(entries))
+    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+    assert is_corrupt
+    assert dir_.entries == (
+        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(
+            name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
+        ),
+    )
+
+    # order is independent of 'file_first' because it is always sorted in git order
+    assert dir_.raw_manifest == (
+        # fmt: off
+        b"tree 52\x00"
+        + b"0 foo\x00" + b"\x00" * 20
+        + b"1 foo\x00" + b"\x01" * 20
+        # fmt: on
+    )
+
+
+def test_directory_from_possibly_duplicated_entries__two_files1():
+    entries = (
+        DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+    )
+    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+    assert is_corrupt
+
+    assert dir_.entries == (
+        DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(
+            name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
+        ),
+    )
+    assert dir_.raw_manifest == (
+        # fmt: off
+        b"tree 52\x00"
+        + b"1 foo\x00" + b"\x01" * 20
+        + b"0 foo\x00" + b"\x00" * 20
+        # fmt: on
+    )
+
+
+def test_directory_from_possibly_duplicated_entries__two_files2():
+    """
+    Same as above, but entries are in a different order (and order matters
+    to break the tie)
+    """
+    entries = (
+        DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+        DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+    )
+    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+    assert is_corrupt
+
+    assert dir_.entries == (
+        DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+        DirectoryEntry(
+            name=b"foo_0101010101", type="file", target=b"\x01" * 20, perms=1
+        ),
+    )
+    assert dir_.raw_manifest == (
+        # fmt: off
+        b"tree 52\x00"
+        + b"0 foo\x00" + b"\x00" * 20
+        + b"1 foo\x00" + b"\x01" * 20
+        # fmt: on
+    )
+
+
+def test_directory_from_possibly_duplicated_entries__preserve_manifest():
+    entries = (
+        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+        DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+    )
+    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(
+        entries=entries, raw_manifest=b"blah"
+    )
+    assert is_corrupt
+    assert dir_.entries == (
+        DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+        DirectoryEntry(
+            name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
+        ),
+    )
+
+    assert dir_.raw_manifest == b"blah"
+
+
 # Release
-- 
GitLab
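
For illustration, here is a minimal usage sketch of the new factory (not part
of the patch itself); the entry values mirror the file-and-dir test above:

    from swh.model.model import Directory, DirectoryEntry

    # Two entries sharing the name b"foo", as the old postgresql schema allowed.
    entries = (
        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
        DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
    )

    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)

    assert is_corrupt
    # The "dir" entry wins the tie-break and keeps its name; the "file" entry
    # is renamed after the first ten hex digits of its target hash.
    assert [e.name for e in dir_.entries] == [b"foo", b"foo_0000000000"]
    # A raw_manifest was computed from the original (duplicated) entries, so
    # the computed id still matches the legacy on-disk form.
    assert dir_.raw_manifest is not None

Returning an ``(is_corrupt, directory)`` tuple rather than raising lets the
backfiller record the corruption while still emitting a well-formed
``Directory``.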