diff --git a/PKG-INFO b/PKG-INFO index 79e4bad8ae870e5d7a81a0f734e15a1a787d471b..d66225165474a4c06cbbffe272b2e664aab76e53 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.8.0 +Version: 6.9.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers @@ -16,11 +16,38 @@ Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown +License-File: LICENSE +License-File: AUTHORS +Requires-Dist: attrs!=21.1.0 +Requires-Dist: attrs_strict>=0.0.7 +Requires-Dist: deprecated +Requires-Dist: hypothesis +Requires-Dist: iso8601 +Requires-Dist: python-dateutil +Requires-Dist: typing_extensions Provides-Extra: cli +Requires-Dist: swh.core>=0.3; extra == "cli" +Requires-Dist: Click; extra == "cli" +Requires-Dist: dulwich; extra == "cli" Provides-Extra: testing-minimal +Requires-Dist: aiohttp; extra == "testing-minimal" +Requires-Dist: click; extra == "testing-minimal" +Requires-Dist: pytest; extra == "testing-minimal" +Requires-Dist: pytz; extra == "testing-minimal" +Requires-Dist: types-click; extra == "testing-minimal" +Requires-Dist: types-python-dateutil; extra == "testing-minimal" +Requires-Dist: types-pytz; extra == "testing-minimal" Provides-Extra: testing -License-File: LICENSE -License-File: AUTHORS +Requires-Dist: aiohttp; extra == "testing" +Requires-Dist: click; extra == "testing" +Requires-Dist: pytest; extra == "testing" +Requires-Dist: pytz; extra == "testing" +Requires-Dist: types-click; extra == "testing" +Requires-Dist: types-python-dateutil; extra == "testing" +Requires-Dist: types-pytz; extra == "testing" +Requires-Dist: swh.core>=0.3; extra == "testing" +Requires-Dist: Click; extra == "testing" +Requires-Dist: dulwich; extra == "testing" swh-model ========= diff --git a/bin/swh-hashtree b/bin/swh-hashtree index a4f8d7b70b303bf55159b7c44c895a293f9407ec..da5249af49122343c7db1985443fe6f4cb99bae0 100755 --- a/bin/swh-hashtree +++ b/bin/swh-hashtree @@ -15,7 +15,7 @@ from swh.model import from_disk, hashutil def combine_filters(*filters): """Combine several ignore filters""" if len(filters) == 0: - return from_disk.accept_all_directories + return from_disk.accept_all_paths elif len(filters) == 1: return filters[0] @@ -32,7 +32,6 @@ def combine_filters(*filters): ) @click.option("--ignore", multiple=True, help="Ignore pattern.") def main(path, ignore_empty_folder=False, ignore=None): - filters = [] if ignore_empty_folder: filters.append(from_disk.ignore_empty_directories) @@ -43,7 +42,7 @@ def main(path, ignore_empty_folder=False, ignore=None): try: d = from_disk.Directory.from_disk( - path=os.fsencode(path), dir_filter=combine_filters(*filters) + path=os.fsencode(path), path_filter=combine_filters(*filters) ) hash = d.hash except Exception as e: diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 79e4bad8ae870e5d7a81a0f734e15a1a787d471b..d66225165474a4c06cbbffe272b2e664aab76e53 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 6.8.0 +Version: 6.9.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers @@ -16,11 +16,38 @@ Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown +License-File: LICENSE +License-File: AUTHORS +Requires-Dist: attrs!=21.1.0 +Requires-Dist: attrs_strict>=0.0.7 +Requires-Dist: deprecated +Requires-Dist: hypothesis +Requires-Dist: iso8601 +Requires-Dist: python-dateutil +Requires-Dist: typing_extensions Provides-Extra: cli +Requires-Dist: swh.core>=0.3; extra == "cli" +Requires-Dist: Click; extra == "cli" +Requires-Dist: dulwich; extra == "cli" Provides-Extra: testing-minimal +Requires-Dist: aiohttp; extra == "testing-minimal" +Requires-Dist: click; extra == "testing-minimal" +Requires-Dist: pytest; extra == "testing-minimal" +Requires-Dist: pytz; extra == "testing-minimal" +Requires-Dist: types-click; extra == "testing-minimal" +Requires-Dist: types-python-dateutil; extra == "testing-minimal" +Requires-Dist: types-pytz; extra == "testing-minimal" Provides-Extra: testing -License-File: LICENSE -License-File: AUTHORS +Requires-Dist: aiohttp; extra == "testing" +Requires-Dist: click; extra == "testing" +Requires-Dist: pytest; extra == "testing" +Requires-Dist: pytz; extra == "testing" +Requires-Dist: types-click; extra == "testing" +Requires-Dist: types-python-dateutil; extra == "testing" +Requires-Dist: types-pytz; extra == "testing" +Requires-Dist: swh.core>=0.3; extra == "testing" +Requires-Dist: Click; extra == "testing" +Requires-Dist: dulwich; extra == "testing" swh-model ========= diff --git a/swh/model/cli.py b/swh/model/cli.py index abcabfe17ae2215ace4e8604fed3d19f8cab5785..6508602110672049403ed507b5274549527d14dc 100644 --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -70,17 +70,18 @@ def swhid_of_file_content(data) -> CoreSWHID: def model_of_dir( - path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None + path: bytes, + exclude_patterns: Optional[Iterable[bytes]] = None, ) -> Directory: - from swh.model.from_disk import accept_all_directories, ignore_directories_patterns + from swh.model.from_disk import accept_all_paths, ignore_directories_patterns - dir_filter = ( + path_filter = ( ignore_directories_patterns(path, exclude_patterns) if exclude_patterns - else accept_all_directories + else accept_all_paths ) - return Directory.from_disk(path=path, dir_filter=dir_filter) + return Directory.from_disk(path=path, path_filter=path_filter) def swhid_of_dir( diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py index 058e77fa2b39e1293b404c91f1747049add5bfb6..798851a23514c85a44cca0dd6c816153c6ae7485 100644 --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -17,7 +17,18 @@ import glob import os import re import stat -from typing import Any, Iterable, Iterator, List, Optional, Pattern, Tuple +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Pattern, + Tuple, +) +import warnings import attr from attrs_strict import type_validator @@ -239,7 +250,9 @@ class Content(MerkleLeaf): return DiskBackedContent.from_dict(data) -def accept_all_directories(dirpath: str, dirname: str, entries: Iterable[Any]) -> bool: +def accept_all_directories( + dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]] +) -> bool: """Default filter for :func:`Directory.from_disk` accepting all directories @@ -247,11 +260,22 @@ def accept_all_directories(dirpath: str, dirname: str, entries: Iterable[Any]) - dirname (bytes): directory name entries (list): directory entries """ + warnings.warn( + "`accept_all_directories` is deprecated, use `accept_all_paths`", + DeprecationWarning, + ) + return True + + +def accept_all_paths( + path: bytes, name: bytes, entries: Optional[Iterable[Any]] +) -> bool: + """Default filter for :func:`Directory.from_disk` accepting all paths""" return True def ignore_empty_directories( - dirpath: str, dirname: str, entries: Iterable[Any] + dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]] ) -> bool: """Filter for :func:`directory_to_objects` ignoring empty directories @@ -261,6 +285,9 @@ def ignore_empty_directories( Returns: True if the directory is not empty, false if the directory is empty """ + if entries is None: + # Files are not ignored + return True return bool(entries) @@ -285,6 +312,9 @@ def ignore_named_directories(names, *, case_sensitive=True): names: Iterable[Any] = names, case_sensitive: bool = case_sensitive, ): + if entries is None: + # Files are not ignored + return True if case_sensitive: return dirname not in names else: @@ -413,23 +443,43 @@ class Directory(MerkleNode): @classmethod def from_disk( - cls, *, path, dir_filter=accept_all_directories, max_content_length=None - ): + cls, + *, + path: bytes, + path_filter: Callable[ + [bytes, bytes, Optional[List[bytes]]], bool + ] = accept_all_paths, + dir_filter: Optional[ + Callable[[bytes, bytes, Optional[List[bytes]]], bool] + ] = None, + max_content_length: Optional[int] = None, + ) -> "Directory": """Compute the Software Heritage objects for a given directory tree Args: path (bytes): the directory to traverse data (bool): whether to add the data to the content objects save_path (bool): whether to add the path to the content objects - dir_filter (function): a filter to ignore some directories by - name or contents. Takes two arguments: dirname and entries, and + path_filter (function): a filter to ignore some paths. + Takes three arguments: `path`, `name` and `entries`. + `entries` is `None` for files, and a (possibly empty) list of names + for directories. + Returns True if the path should be added, False if the + path should be ignored. + dir_filter (DEPRECATED, function): a filter to ignore some directories + by name or contents. Takes two arguments: dirname and entries, and returns True if the directory should be added, False if the directory should be ignored. max_content_length (Optional[int]): if given, all contents larger than this will be skipped. """ top_path = path - dirs = {} + dirs: Dict[bytes, Directory] = {} + if dir_filter is not None: + warnings.warn( + "`dir_filter` is deprecated. Use `path_filter` instead", + DeprecationWarning, + ) for root, dentries, fentries in os.walk(top_path, topdown=False): entries = {} @@ -438,12 +488,17 @@ class Directory(MerkleNode): for name in fentries + dentries: path = os.path.join(root, name) if not os.path.isdir(path) or os.path.islink(path): + if not path_filter(path, name, None): + continue content = Content.from_file( path=path, max_content_length=max_content_length ) entries[name] = content else: - if dir_filter(path, name, dirs[path].entries): + if dir_filter is not None: + if dir_filter(path, name, dirs[path].entries): + entries[name] = dirs[path] + elif path_filter(path, name, dirs[path].entries): entries[name] = dirs[path] dirs[root] = cls({"name": os.path.basename(root), "path": root}) diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index 19a8d89118f1e6b522b2f8af61ff6a1337515e38..45671861f8c8a6f1215bc1fc758505d9b7722845 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -56,7 +56,8 @@ from .model import ( from .swhids import ExtendedObjectType, ExtendedSWHID pgsql_alphabet = characters( - blacklist_categories=("Cs",), blacklist_characters=["\u0000"] + blacklist_categories=["Cs"], + blacklist_characters=["\u0000"], ) # postgresql does not like these diff --git a/swh/model/model.py b/swh/model/model.py index 03b35d6cf06370db7e359b35f69e3cfa4d21949c..c919df0d7296afa7485b209b420992489f7f25e6 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -66,6 +66,10 @@ def hash_repr(h: bytes) -> str: return f"hash_to_bytes('{hash_to_hex(h)}')" +def parents_repr(parents: Tuple[Sha1Git, ...]): + return repr(tuple(hash_repr(p) for p in parents)).replace('"', "") + + def freeze_optional_dict( d: Union[None, Dict, ImmutableDict] ) -> Optional[ImmutableDict]: @@ -1079,7 +1083,10 @@ class Revision(HashableObjectWithManifest, BaseModel): default=None, ) parents = attr.ib( - type=Tuple[Sha1Git, ...], validator=generic_type_validator, default=() + type=Tuple[Sha1Git, ...], + validator=generic_type_validator, + default=(), + repr=parents_repr, ) id = attr.ib( type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py index 1ebcbbe981f8e53b73a2e3913bfd9b6b537c15ec..3ab7ba242b836c16934175d3bbd58555721b6755 100644 --- a/swh/model/tests/test_from_disk.py +++ b/swh/model/tests/test_from_disk.py @@ -768,7 +768,7 @@ class DirectoryToObjects(DataMixin, unittest.TestCase): def test_directory_to_objects_ignore_empty(self): directory = Directory.from_disk( - path=self.tmpdir_name, dir_filter=from_disk.ignore_empty_directories + path=self.tmpdir_name, path_filter=from_disk.ignore_empty_directories ) for name, value in self.contents.items(): @@ -798,7 +798,7 @@ class DirectoryToObjects(DataMixin, unittest.TestCase): def test_directory_to_objects_ignore_name(self): directory = Directory.from_disk( path=self.tmpdir_name, - dir_filter=from_disk.ignore_named_directories([b"symlinks"]), + path_filter=from_disk.ignore_named_directories([b"symlinks"]), ) for name, value in self.contents.items(): self.assertContentEqual(directory[b"contents/" + name], value) @@ -826,7 +826,7 @@ class DirectoryToObjects(DataMixin, unittest.TestCase): def test_directory_to_objects_ignore_name_case(self): directory = Directory.from_disk( path=self.tmpdir_name, - dir_filter=from_disk.ignore_named_directories( + path_filter=from_disk.ignore_named_directories( [b"symLiNks"], case_sensitive=False ), ) @@ -868,6 +868,52 @@ class DirectoryToObjects(DataMixin, unittest.TestCase): b"foo0", ] + def test_directory_path_filter(self): + def filter_func(path, name, entries): + return name.startswith(b"foo") + + with tempfile.TemporaryDirectory() as dirname: + dirname = os.fsencode(dirname) + open(os.path.join(dirname, b"foofile"), "a") + open(os.path.join(dirname, b"file"), "a") + os.mkdir(os.path.join(dirname, b"foo")) + os.mkdir(os.path.join(dirname, b"baz")) + + # No filters + directory = Directory.from_disk(path=dirname) + assert [entry["name"] for entry in directory.entries] == [ + b"baz", + b"file", + b"foo", + b"foofile", + ] + + # Filter paths + directory = Directory.from_disk(path=dirname, path_filter=filter_func) + assert [entry["name"] for entry in directory.entries] == [ + b"foo", + b"foofile", + ] + + # Filter directories and paths (`path_filter` should take precedence) + with pytest.deprecated_call(): + directory = Directory.from_disk( + path=dirname, path_filter=filter_func, dir_filter=filter_func + ) + assert [entry["name"] for entry in directory.entries] == [ + b"foo", + b"foofile", + ] + + # Test deprecated way + with pytest.deprecated_call(): + directory = Directory.from_disk(path=dirname, dir_filter=filter_func) + assert [entry["name"] for entry in directory.entries] == [ + b"file", + b"foo", + b"foofile", + ] + @pytest.mark.fs class TarballTest(DataMixin, unittest.TestCase): diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 920c8dd5c91799e5c8bc572f897586a575a13b7d..f2adc62941d98bff9752f10f151bbb1ba87c796d 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -1461,7 +1461,7 @@ def test_revision_directory_swhid(): def test_revision_parent_swhids(): - revision_d = revision_example.copy() + revision_d = copy.deepcopy(revision_example) revision_d["parents"].append( hash_to_bytes("b2a7e1260492e344fab3cbf91bc13c91e05426fd") ) @@ -1954,3 +1954,29 @@ def test_metadata_normalize_discovery_date(): assert md.discovery_date == truncated_date assert md.discovery_date.tzinfo == datetime.timezone.utc + + +def test_revision_repr(): + from swh.model.model import RevisionType # noqa + + revision = Revision.from_dict(revision_example) + rev_repr = repr(revision) + + assert rev_repr == ( + "Revision(message=b'Linux 4.2-rc2\\n', " + "author=Person(fullname=b'Linus Torvalds <torvalds@linux-foundation.org>', " + "name=b'Linus Torvalds', email=b'torvalds@linux-foundation.org'), " + "committer=Person(fullname=b'Linus Torvalds <torvalds@linux-foundation.org>', " + "name=b'Linus Torvalds', email=b'torvalds@linux-foundation.org'), " + "date=TimestampWithTimezone(timestamp=Timestamp(seconds=1436739030, microseconds=0), " + "offset_bytes=b'-0700'), " + "committer_date=TimestampWithTimezone(timestamp=Timestamp(seconds=1436739030, " + "microseconds=0), offset_bytes=b'-0700'), " + "type=RevisionType.GIT, " + "directory=hash_to_bytes('85a74718d377195e1efd0843ba4f3260bad4fe07'), " + "synthetic=False, metadata=None, " + "parents=(hash_to_bytes('01e2d0627a9a6edb24c37db45db5ecb31e9de808'),), " + "id=hash_to_bytes('bc0195aad0daa2ad5b0d76cce22b167bc3435590'), " + "extra_headers=(), raw_manifest=None)" + ) + assert eval(rev_repr) == revision