Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • anlambert/swh-loader-git
  • lunar/swh-loader-git
  • ardumont/swh-loader-git
  • stsp/swh-loader-git
  • swh/devel/swh-loader-git
  • douardda/swh-loader-git
  • olasd/swh-loader-git
  • marmoute/swh-loader-git
  • rboyer/swh-loader-git
9 results
Show changes
Commits on Source (22)
Showing
with 371 additions and 228 deletions
# Changes here will be overwritten by Copier
_commit: v0.2.0
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage git loader
distribution_name: swh-loader-git
......
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/python/black
rev: 23.1.0
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 6.0.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 6.0.0
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==22.9.23]
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
stages: [commit]
stages: [pre-commit]
- id: codespell
name: Check commit message spelling
stages: [commit-msg]
......
......@@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socio-economic status,
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
......
# Global options, applied to every module mypy checks
[mypy]
namespace_packages = True
warn_unused_ignores = True
explicit_package_bases = True
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
# 3rd party libraries without stubs (yet)
# Each [mypy-<pattern>] section below applies only to modules matching the
# pattern and silences the missing-import error for that package.
[mypy-celery.*]
ignore_missing_imports = True
[mypy-dulwich.*]
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-swh.loader.*]
ignore_missing_imports = True
......@@ -45,7 +45,7 @@ build-backend = "setuptools.build_meta"
fallback_version = "0.0.1"
[tool.black]
target-version = ['py37']
target-version = ['py39', 'py310', 'py311', 'py312']
[tool.isort]
multi_line_output = 3
......@@ -56,3 +56,35 @@ ensure_newline_before_comments = true
line_length = 88
force_sort_within_sections = true
known_first_party = ['swh']
[tool.mypy]
namespace_packages = true
warn_unused_ignores = true
explicit_package_bases = true
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
plugins = []
# 3rd party libraries without stubs (yet)
# [[tool.mypy.overrides]]
# module = [
# "package1.*",
# "package2.*",
# ]
# ignore_missing_imports = true
[tool.flake8]
select = ["C", "E", "F", "W", "B950"]
ignore = [
"E203", # whitespaces before ':' <https://github.com/psf/black/issues/315>
"E231", # missing whitespace after ','
"E501", # line too long, use B950 warning from flake8-bugbear instead
"W503" # line break before binary operator <https://github.com/psf/black/issues/52>
]
max-line-length = 88
[tool.pytest.ini_options]
norecursedirs = "build docs .*"
asyncio_mode = "strict"
consider_namespace_packages = true
[pytest]
norecursedirs = build docs .*
asyncio_mode = strict
consider_namespace_packages = true
# addopts and markers are multi-line values: their continuation lines must be
# indented, otherwise each one is read as a new key instead of extending the
# value above it.
addopts =
    -p no:pytest_swh_scheduler
    -p no:pytest_swh_storage
# Drop this when these fixtures aren't imported automatically
markers =
    fs: depends on writing to the filesystem
swh.core >= 2.22.0
swh.loader.core >= 5.14.2
swh.model >= 6.9.0
swh.loader.core >= 5.18.3
swh.model >= 7.1.0
swh.scheduler >= 0.0.39
swh.storage >= 0.22.0
swh.storage >= 2.4.1
celery-types
pytest >= 8.1
pytest-mock
requests_mock
swh.scheduler[testing] >= 0.5.0
swh.storage[testing]
swh.loader.core[testing] >= 5.18.1
swh.scheduler[pytest] >= 3.1.0
swh.storage[pytest] >= 3.1.0
types-Deprecated
types-click
types-urllib3
[flake8]
# E203: whitespaces before ':' <https://github.com/psf/black/issues/315>
# E231: missing whitespace after ','
# E501: line too long, use B950 warning from flake8-bugbear instead
# W503: line break before binary operator <https://github.com/psf/black/issues/52>
# B950 is selected explicitly because it stands in for the ignored E501
select = C,E,F,W,B950
ignore = E203,E231,E501,W503
# Line-length limit used by the line-length check
max-line-length = 88
# Copyright (C) 2015-2022 The Software Heritage developers
# Copyright (C) 2015-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -31,8 +31,9 @@ from swh.model.model import (
Revision,
RevisionType,
SkippedContent,
TargetType,
SnapshotTargetType,
Timestamp,
TimestampOverflowException,
TimestampWithTimezone,
)
......@@ -63,7 +64,7 @@ def check_id(obj: HashableObject) -> None:
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
"""Convert a dulwich blob to a Software Heritage content id"""
if obj.type_name != b"blob":
if obj.type_name != Blob.type_name:
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
......@@ -81,7 +82,7 @@ def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
"""Convert a dulwich blob to a Software Heritage content"""
if obj.type_name != b"blob":
if obj.type_name != Blob.type_name:
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
......@@ -102,7 +103,7 @@ def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
def dulwich_tree_to_directory(obj: ShaFile) -> Directory:
"""Format a tree as a directory"""
if obj.type_name != b"tree":
if obj.type_name != Tree.type_name:
raise ValueError("Argument is not a tree.")
tree = cast(Tree, obj)
......@@ -161,11 +162,13 @@ def dulwich_tsinfo_to_timestamp(
timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
"""Convert the dulwich timestamp information to a structure compatible with
Software Heritage."""
ts = Timestamp(
seconds=int(timestamp),
microseconds=0,
)
Software Heritage.
Returns epoch if the timestamp overflows :class:`Timestamp`."""
try:
ts = Timestamp(seconds=int(timestamp), microseconds=0)
except TimestampOverflowException:
ts = Timestamp(seconds=0, microseconds=0)
if timezone_bytes is None:
# Failed to parse from the raw manifest, fallback to what Dulwich managed to
# parse.
......@@ -179,7 +182,7 @@ def dulwich_tsinfo_to_timestamp(
def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
if obj.type_name != b"commit":
if obj.type_name != Commit.type_name:
raise ValueError("Argument is not a commit.")
commit = cast(Commit, obj)
......@@ -207,8 +210,8 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
assert raw_string.endswith(b"\n")
extra_headers.append((b"mergetag", raw_string[:-1]))
if commit.extra:
extra_headers.extend((k, v) for k, v in commit.extra)
if commit._extra:
extra_headers.extend((k, v) for k, v in commit._extra)
if commit.gpgsig:
extra_headers.append((b"gpgsig", commit.gpgsig))
......@@ -219,14 +222,14 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
date=dulwich_tsinfo_to_timestamp(
commit.author_time,
commit.author_timezone,
commit._author_timezone_neg_utc,
bool(commit._author_timezone_neg_utc),
author_timezone,
),
committer=parse_author(commit.committer),
committer_date=dulwich_tsinfo_to_timestamp(
commit.commit_time,
commit.commit_timezone,
commit._commit_timezone_neg_utc,
bool(commit._commit_timezone_neg_utc),
committer_timezone,
),
type=RevisionType.GIT,
......@@ -256,23 +259,23 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
DULWICH_TARGET_TYPES = {
b"blob": TargetType.CONTENT,
b"tree": TargetType.DIRECTORY,
b"commit": TargetType.REVISION,
b"tag": TargetType.RELEASE,
Blob.type_name: SnapshotTargetType.CONTENT,
Tree.type_name: SnapshotTargetType.DIRECTORY,
Commit.type_name: SnapshotTargetType.REVISION,
Tag.type_name: SnapshotTargetType.RELEASE,
}
DULWICH_OBJECT_TYPES = {
b"blob": ObjectType.CONTENT,
b"tree": ObjectType.DIRECTORY,
b"commit": ObjectType.REVISION,
b"tag": ObjectType.RELEASE,
Blob.type_name: ObjectType.CONTENT,
Tree.type_name: ObjectType.DIRECTORY,
Commit.type_name: ObjectType.REVISION,
Tag.type_name: ObjectType.RELEASE,
}
def dulwich_tag_to_release(obj: ShaFile) -> Release:
if obj.type_name != b"tag":
if obj.type_name != Tag.type_name:
raise ValueError("Argument is not a tag.")
tag = cast(Tag, obj)
......
......@@ -15,7 +15,7 @@ from swh.loader.core.loader import BaseDirectoryLoader
from swh.loader.exception import NotFound
from swh.loader.git.utils import raise_not_found_repository
from swh.model.from_disk import ignore_empty_directories, ignore_named_directories
from swh.model.model import Snapshot, SnapshotBranch, TargetType
from swh.model.model import Snapshot, SnapshotBranch, SnapshotTargetType
def git() -> str:
......@@ -107,7 +107,7 @@ class GitCheckoutLoader(BaseDirectoryLoader):
self.git_ref = kwargs.pop("ref")
self.submodules = kwargs.pop("submodules", False)
# We use a filter which ignores the .git folder and the empty git trees
super().__init__(*args, dir_filter=list_git_tree, **kwargs)
super().__init__(*args, path_filter=list_git_tree, **kwargs)
def fetch_artifact(self) -> Iterator[Path]:
with raise_not_found_repository():
......@@ -143,12 +143,12 @@ class GitCheckoutLoader(BaseDirectoryLoader):
return Snapshot(
branches={
b"HEAD": SnapshotBranch(
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
target=branch_name,
),
branch_name: SnapshotBranch(
target=self.directory.hash,
target_type=TargetType.DIRECTORY,
target_type=SnapshotTargetType.DIRECTORY,
),
}
)
......@@ -25,7 +25,7 @@ from typing import (
import urllib.parse
from dulwich.errors import NotGitRepository
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
from dulwich.objects import S_IFGITLINK, Blob, Commit, ShaFile, Tree
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
import requests
from tenacity.before_sleep import before_sleep_log
......@@ -42,7 +42,7 @@ fetch_pack_logger = logger.getChild("fetch_pack")
class BytesWriter(Protocol):
def write(self, data: bytes):
...
pass
def requests_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
......@@ -128,11 +128,9 @@ class GitObjectsFetcher:
commit_objects = []
for ref in wants:
ref_object = self._get_git_object(ref)
if ref_object.type_num == Commit.type_num:
self.objects[ref_object.type_name].add(ref)
if ref_object.type_name == Commit.type_name:
commit_objects.append(cast(Commit, ref_object))
self.objects[b"commit"].add(ref)
else:
self.objects[b"tag"].add(ref)
# perform DFS on commits graph
while commit_objects:
......@@ -142,18 +140,19 @@ class GitObjectsFetcher:
for parent in commit.parents:
if (
# commit not already seen in the current load
parent not in self.objects[b"commit"]
parent not in self.objects[Commit.type_name]
# commit not already archived by a previous load
and parent not in self.base_repo.local_heads
):
commit_objects.append(cast(Commit, self._get_git_object(parent)))
self.objects[b"commit"].add(parent)
self.objects[Commit.type_name].add(parent)
def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
"""Returns a generator on fetched git objects per type.
Args:
object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"
object_type: Git object type, either Blob.type_name, Commit.type_name,
Tag.type_name or Tree.type_name
Returns:
A generator fetching git objects on the fly.
......@@ -258,9 +257,9 @@ class GitObjectsFetcher:
return ShaFile.from_file(self._http_get(object_path))
def _fetch_tree_objects(self, sha: bytes) -> None:
if sha not in self.objects[b"tree"]:
if sha not in self.objects[Tree.type_name]:
tree = cast(Tree, self._get_git_object(sha))
self.objects[b"tree"].add(sha)
self.objects[Tree.type_name].add(sha)
for item in tree.items():
if item.mode == S_IFGITLINK:
# skip submodules as objects are not stored in repository
......@@ -268,4 +267,4 @@ class GitObjectsFetcher:
if item.mode & stat.S_IFDIR:
self._fetch_tree_objects(item.sha)
else:
self.objects[b"blob"].add(item.sha)
self.objects[Blob.type_name].add(item.sha)
# Copyright (C) 2015-2023 The Software Heritage developers
# Copyright (C) 2015-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from datetime import datetime
import logging
......@@ -12,12 +13,12 @@ from typing import Dict, Optional
from deprecated import deprecated
from dulwich.errors import ObjectFormatException
import dulwich.objects
from dulwich.objects import EmptyFileException
from dulwich.objects import Blob, Commit, EmptyFileException, Tag, Tree
import dulwich.repo
from swh.loader.git.utils import raise_not_found_repository
from swh.model import hashutil
from swh.model.model import Snapshot, SnapshotBranch, TargetType
from swh.model.model import Snapshot, SnapshotBranch, SnapshotTargetType
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.interface import StorageInterface
......@@ -221,11 +222,11 @@ class GitLoaderFromDisk(BaseGitLoader):
def has_contents(self):
"""Checks whether we need to load contents"""
return bool(self.type_to_ids[b"blob"])
return bool(self.type_to_ids[Blob.type_name])
def get_content_ids(self):
"""Get the content identifiers from the git repository"""
for oid in self.type_to_ids[b"blob"]:
for oid in self.type_to_ids[Blob.type_name]:
yield converters.dulwich_blob_to_content_id(self.repo[oid])
def get_contents(self):
......@@ -241,11 +242,14 @@ class GitLoaderFromDisk(BaseGitLoader):
def has_directories(self):
"""Checks whether we need to load directories"""
return bool(self.type_to_ids[b"tree"])
return bool(self.type_to_ids[Tree.type_name])
def get_directory_ids(self):
"""Get the directory identifiers from the git repository"""
return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tree"])
return (
hashutil.hash_to_bytes(id.decode())
for id in self.type_to_ids[Tree.type_name]
)
def get_directories(self):
"""Get the directories that need to be loaded"""
......@@ -260,12 +264,13 @@ class GitLoaderFromDisk(BaseGitLoader):
def has_revisions(self):
"""Checks whether we need to load revisions"""
return bool(self.type_to_ids[b"commit"])
return bool(self.type_to_ids[Commit.type_name])
def get_revision_ids(self):
"""Get the revision identifiers from the git repository"""
return (
hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"commit"]
hashutil.hash_to_bytes(id.decode())
for id in self.type_to_ids[Commit.type_name]
)
def get_revisions(self):
......@@ -281,11 +286,14 @@ class GitLoaderFromDisk(BaseGitLoader):
def has_releases(self):
"""Checks whether we need to load releases"""
return bool(self.type_to_ids[b"tag"])
return bool(self.type_to_ids[Tag.type_name])
def get_release_ids(self):
"""Get the release identifiers from the git repository"""
return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tag"])
return (
hashutil.hash_to_bytes(id.decode())
for id in self.type_to_ids[Tag.type_name]
)
def get_releases(self):
"""Get the releases that need to be loaded"""
......@@ -317,7 +325,9 @@ class GitLoaderFromDisk(BaseGitLoader):
for ref, target in self.repo.refs.get_symrefs().items():
if utils.ignore_branch_name(ref):
continue
branches[ref] = SnapshotBranch(target=target, target_type=TargetType.ALIAS)
branches[ref] = SnapshotBranch(
target=target, target_type=SnapshotTargetType.ALIAS
)
if target not in branches:
# This handles the case where the pointer is "dangling".
# There's a chance that a further symbolic reference will
......
......@@ -50,9 +50,10 @@ from swh.model.model import (
Revision,
Snapshot,
SnapshotBranch,
TargetType,
SnapshotTargetType,
)
from swh.model.swhids import ExtendedObjectType
from swh.objstorage.interface import objid_from_dict
from swh.storage.algos.directory import directory_get
from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import StorageInterface
......@@ -108,7 +109,7 @@ class RepoRepresentation:
heads_logger.debug("Heads known in the archive:")
for base_snapshot in self.base_snapshots:
for branch_name, branch in base_snapshot.branches.items():
if not branch or branch.target_type == TargetType.ALIAS:
if not branch or branch.target_type == SnapshotTargetType.ALIAS:
continue
heads_logger.debug(" %r: %s", branch_name, branch.target.hex())
self.local_heads.add(HexBytes(hashutil.hash_to_bytehex(branch.target)))
......@@ -226,7 +227,7 @@ class GitLoader(BaseGitLoader):
# state initialized in fetch_data
self.remote_refs: Dict[bytes, HexBytes] = {}
self.symbolic_refs: Dict[bytes, HexBytes] = {}
self.ref_object_types: Dict[bytes, Optional[TargetType]] = {}
self.ref_object_types: Dict[bytes, Optional[SnapshotTargetType]] = {}
self.ext_refs: Dict[bytes, Optional[Tuple[int, bytes]]] = {}
self.repo_pack_size_bytes = 0
self.urllib3_extra_kwargs = urllib3_extra_kwargs
......@@ -308,7 +309,11 @@ class GitLoader(BaseGitLoader):
)
def get_full_snapshot(self, origin_url) -> Optional[Snapshot]:
return snapshot_get_latest(self.storage, origin_url)
return snapshot_get_latest(
self.storage,
origin_url,
visit_type=self.visit_type,
)
def load_metadata_objects(
self, metadata_objects: List[RawExtrinsicMetadata]
......@@ -545,7 +550,7 @@ class GitLoader(BaseGitLoader):
if cnts and cnts[0] is not None:
cnt = cnts[0]
d = cnt.to_dict()
d["data"] = storage.content_get_data(cnt.sha1)
d["data"] = storage.content_get_data(objid_from_dict(d))
cnt = Content.from_dict(d)
cnt.check()
set_ext_ref(Blob.type_num, content_git_object(cnt), "content")
......@@ -607,9 +612,9 @@ class GitLoader(BaseGitLoader):
def get_contents(self) -> Iterable[BaseContent]:
"""Format the blobs from the git repository as swh contents"""
for raw_obj in self.iter_objects(b"blob"):
for raw_obj in self.iter_objects(Blob.type_name):
if raw_obj.id in self.ref_object_types:
self.ref_object_types[raw_obj.id] = TargetType.CONTENT
self.ref_object_types[raw_obj.id] = SnapshotTargetType.CONTENT
yield converters.dulwich_blob_to_content(
raw_obj, max_content_size=self.max_content_size
......@@ -617,25 +622,25 @@ class GitLoader(BaseGitLoader):
def get_directories(self) -> Iterable[Directory]:
"""Format the trees as swh directories"""
for raw_obj in self.iter_objects(b"tree"):
for raw_obj in self.iter_objects(Tree.type_name):
if raw_obj.id in self.ref_object_types:
self.ref_object_types[raw_obj.id] = TargetType.DIRECTORY
self.ref_object_types[raw_obj.id] = SnapshotTargetType.DIRECTORY
yield converters.dulwich_tree_to_directory(raw_obj)
def get_revisions(self) -> Iterable[Revision]:
"""Format commits as swh revisions"""
for raw_obj in self.iter_objects(b"commit"):
for raw_obj in self.iter_objects(Commit.type_name):
if raw_obj.id in self.ref_object_types:
self.ref_object_types[raw_obj.id] = TargetType.REVISION
self.ref_object_types[raw_obj.id] = SnapshotTargetType.REVISION
yield converters.dulwich_commit_to_revision(raw_obj)
def get_releases(self) -> Iterable[Release]:
"""Retrieve all the release objects from the git repository"""
for raw_obj in self.iter_objects(b"tag"):
for raw_obj in self.iter_objects(Tag.type_name):
if raw_obj.id in self.ref_object_types:
self.ref_object_types[raw_obj.id] = TargetType.RELEASE
self.ref_object_types[raw_obj.id] = SnapshotTargetType.RELEASE
yield converters.dulwich_tag_to_release(raw_obj)
......@@ -679,7 +684,7 @@ class GitLoader(BaseGitLoader):
# Handle symbolic references as alias branches
for ref_name, target in self.symbolic_refs.items():
branches[ref_name] = SnapshotBranch(
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
target=target,
)
if target not in branches and target not in unfetched_refs:
......@@ -698,12 +703,12 @@ class GitLoader(BaseGitLoader):
branch.target: branch
for base_snapshot in reversed(self.base_snapshots)
for branch in base_snapshot.branches.values()
if branch and branch.target_type != TargetType.ALIAS
if branch and branch.target_type != SnapshotTargetType.ALIAS
}
assert all(
base_snapshot_reverse_branches[branch.target] == branch
for branch in self.prev_snapshot.branches.values()
if branch and branch.target_type != TargetType.ALIAS
if branch and branch.target_type != SnapshotTargetType.ALIAS
), "base_snapshot_reverse_branches is not a superset of prev_snapshot"
for ref_name, target in unfetched_refs.items():
......@@ -725,10 +730,13 @@ class GitLoader(BaseGitLoader):
targets_unknown = set(refs_for_target)
for method, target_type in (
(self.storage.revision_missing, TargetType.REVISION),
(self.storage.release_missing, TargetType.RELEASE),
(self.storage.directory_missing, TargetType.DIRECTORY),
(self.storage.content_missing_per_sha1_git, TargetType.CONTENT),
(self.storage.revision_missing, SnapshotTargetType.REVISION),
(self.storage.release_missing, SnapshotTargetType.RELEASE),
(self.storage.directory_missing, SnapshotTargetType.DIRECTORY),
(
self.storage.content_missing_per_sha1_git,
SnapshotTargetType.CONTENT,
),
):
missing = set(method(list(targets_unknown)))
known = targets_unknown - missing
......
# Copyright (C) 2015-2022 The Software Heritage developers
# Copyright (C) 2015-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -10,7 +10,7 @@ import shutil
import subprocess
import tempfile
import dulwich.objects
from dulwich.objects import Commit, Tag, Tree
import dulwich.repo
import pytest
......@@ -85,7 +85,6 @@ class SWHObjectType:
self.type_name = type_name
@pytest.mark.fs
class TestConverters:
@classmethod
def setup_class(cls):
......@@ -171,7 +170,7 @@ class TestConverters:
def test_corrupt_tree(self):
sha1 = b"a9b41fc6347d778f16c4380b598d8083e9b4c1fb"
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
tree = dulwich.objects.Tree()
tree = Tree()
tree.add(b"file1", 0o644, target)
assert tree.sha().hexdigest() == sha1.decode()
converters.dulwich_tree_to_directory(tree)
......@@ -196,7 +195,7 @@ class TestConverters:
b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
)
tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string)
tree = Tree.from_raw_string(Tree.type_name, raw_string)
assert converters.dulwich_tree_to_directory(tree) == Directory(
entries=(
......@@ -233,7 +232,7 @@ class TestConverters:
(b"tree_normal", 0o040000, "dir"),
]
tree = dulwich.objects.Tree()
tree = Tree()
for name, mode, _ in entries:
tree.add(name, mode, b"00" * 20)
......@@ -258,7 +257,7 @@ class TestConverters:
b"\x1d\xd3\xec\x83\x94+\xbc\x04\xde\xee\x7f\xc6\xbe\x8b\x9cnp=W\xf9"
)
tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string)
tree = Tree.from_raw_string(Tree.type_name, raw_string)
dir_ = Directory(
entries=(
......@@ -387,7 +386,7 @@ class TestConverters:
author = Person(
fullname=b"Foo <foo@example.org>", name=b"Foo", email=b"foo@example.org"
)
commit = dulwich.objects.Commit()
commit = Commit()
commit.tree = target
commit.message = message
commit.author = commit.committer = b"Foo <foo@example.org>"
......@@ -417,7 +416,7 @@ class TestConverters:
sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9")
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some commit message"
commit = dulwich.objects.Commit()
commit = Commit()
commit.tree = target
commit.message = message
commit.gpgsig = GPGSIG
......@@ -497,7 +496,7 @@ class TestConverters:
b"committer Foo <foo@example.org> 1640191028 +0200\n\n"
b"some commit message"
)
commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string)
commit = Commit.from_raw_string(Commit.type_name, raw_string)
date = TimestampWithTimezone(
timestamp=Timestamp(seconds=1640191028, microseconds=0),
offset_bytes=b"+0200",
......@@ -520,7 +519,7 @@ class TestConverters:
# Mess with the offset
raw_string2 = raw_string.replace(b"+0200", b"+200")
commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2)
commit = Commit.from_raw_string(Commit.type_name, raw_string2)
date = TimestampWithTimezone(
timestamp=Timestamp(seconds=1640191028, microseconds=0),
offset_bytes=b"+200",
......@@ -546,7 +545,7 @@ class TestConverters:
b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
)
commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2)
commit = Commit.from_raw_string(Commit.type_name, raw_string2)
date = TimestampWithTimezone(
timestamp=Timestamp(seconds=1640191028, microseconds=0),
offset_bytes=b"+0200",
......@@ -567,6 +566,69 @@ class TestConverters:
raw_manifest=b"commit 161\x00" + raw_string2,
)
def test_commit_timestamp_overflow(self):
"""Checks raw_manifest is set when the commit cannot fit the data model.

Both the author and committer lines carry the timestamp 99999999999999999.
The expected revision holds the epoch (``seconds=0``) in its dates, keeps the
``+0200`` offset, and preserves the original bytes in ``raw_manifest``.
"""
# Well-formed manifest
raw_string = (
b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
b"author Foo <foo@example.org> 99999999999999999 +0200\n"
b"committer Foo <foo@example.org> 99999999999999999 +0200\n\n"
b"some commit message"
)
commit = Commit.from_raw_string(Commit.type_name, raw_string)
# Expected date: epoch replaces the oversized timestamp, the offset is kept
date = TimestampWithTimezone(
timestamp=Timestamp(seconds=0, microseconds=0),
offset_bytes=b"+0200",
)
assert converters.dulwich_commit_to_revision(commit) == Revision(
message=b"some commit message",
directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
synthetic=False,
author=Person.from_fullname(
b"Foo <foo@example.org>",
),
committer=Person.from_fullname(
b"Foo <foo@example.org>",
),
date=date,
committer_date=date,
type=RevisionType.GIT,
# 175 is the byte length of raw_string, prefixed as in a git object header
raw_manifest=b"commit 175\x00" + raw_string,
)
def test_commit_timestamp_large_offset(self):
"""Checks commits with an offset too large to fit in :class:`datetime` can
still be parsed.

The expected revision keeps the timestamp as given, stores the offset verbatim
as ``b"+99999999"``, and has no ``raw_manifest``.
"""
# Well-formed manifest
raw_string = (
b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
b"author Foo <foo@example.org> 1640191028 +99999999\n"
b"committer Foo <foo@example.org> 1640191028 +99999999\n\n"
b"some commit message"
)
commit = Commit.from_raw_string(Commit.type_name, raw_string)
# Expected date: seconds unchanged, oversized offset carried through as bytes
date = TimestampWithTimezone(
timestamp=Timestamp(seconds=1640191028, microseconds=0),
offset_bytes=b"+99999999",
)
assert converters.dulwich_commit_to_revision(commit) == Revision(
message=b"some commit message",
directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
synthetic=False,
author=Person.from_fullname(
b"Foo <foo@example.org>",
),
committer=Person.from_fullname(
b"Foo <foo@example.org>",
),
date=date,
committer_date=date,
type=RevisionType.GIT,
# Unlike the timestamp-overflow case, no raw manifest is expected here
raw_manifest=None,
)
def test_author_line_to_author(self):
# edge case out of the way
with pytest.raises(TypeError):
......@@ -612,9 +674,9 @@ class TestConverters:
sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136")
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = None
tag.tagger = None
......@@ -650,9 +712,9 @@ class TestConverters:
datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp()
)
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = None
tag.tagger = tagger
......@@ -694,9 +756,9 @@ class TestConverters:
tagger = b"hey dude <hello@mail.org>"
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = None
tag.tagger = tagger
......@@ -735,9 +797,9 @@ class TestConverters:
date = int(
datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
)
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = None
tag.tagger = tagger
......@@ -777,9 +839,9 @@ class TestConverters:
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = GPGSIG
tag.tagger = None
......@@ -810,9 +872,9 @@ class TestConverters:
sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
tag = dulwich.objects.Tag()
tag = Tag()
tag.name = b"blah"
tag.object = (dulwich.objects.Commit, target)
tag.object = (Commit, target)
tag.message = message
tag.signature = GPGSIG
tag.tagger = None
......@@ -847,7 +909,7 @@ class TestConverters:
b"tagger Foo <foo@example.org> 1640191027 +0200\n\n"
b"some release message"
)
tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string)
tag = Tag.from_raw_string(Tag.type_name, raw_string)
assert converters.dulwich_tag_to_release(tag) == Release(
name=b"blah",
message=b"some release message",
......@@ -866,7 +928,7 @@ class TestConverters:
# Mess with the offset (negative UTC)
raw_string2 = raw_string.replace(b"+0200", b"-0000")
tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
tag = Tag.from_raw_string(Tag.type_name, raw_string2)
assert converters.dulwich_tag_to_release(tag) == Release(
name=b"blah",
message=b"some release message",
......@@ -884,7 +946,7 @@ class TestConverters:
# Mess with the offset (other)
raw_string2 = raw_string.replace(b"+0200", b"+200")
tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
tag = Tag.from_raw_string(Tag.type_name, raw_string2)
assert converters.dulwich_tag_to_release(tag) == Release(
name=b"blah",
message=b"some release message",
......@@ -905,7 +967,7 @@ class TestConverters:
b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
)
tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
tag = Tag.from_raw_string(Tag.type_name, raw_string2)
assert converters.dulwich_tag_to_release(tag) == Release(
name=b"blah",
message=b"some release message",
......
......@@ -69,7 +69,7 @@ def test_list_git_tree(datadir, tmp_path):
assert empty_bar_found is True
dir2 = Directory.from_disk(path=repo_path, dir_filter=list_git_tree)
dir2 = Directory.from_disk(path=repo_path, path_filter=list_git_tree)
dir2_entries = [d["name"] for d in dir2.entries]
assert b".git" not in dir2_entries
assert b"empty-foo" not in dir2_entries
......@@ -177,8 +177,6 @@ def test_git_loader_directory(swh_storage, datadir, tmp_path, reference):
checksums=checksums,
)
assert loader.dir_filter == list_git_tree
actual_result = loader.load()
assert actual_result == {"status": "eventful"}
......@@ -229,7 +227,7 @@ def test_loader_git_directory_hash_mismatch(swh_storage, datadir, tmp_path):
actual_result = loader.load()
# Ingestion fails because the checks failed
assert actual_result == {"status": "failed"}
assert actual_result["status"] == "failed"
assert get_stats(swh_storage) == {
"content": 0,
"directory": 0,
......@@ -338,6 +336,11 @@ def test_loader_git_directory_without_or_with_submodule(
check=True,
cwd=repo.path,
)
run(
["git", "config", "commit.gpgsign", "false"],
check=True,
cwd=repo.path,
)
# add the repository served by the simple_git_repository_url fixture as a
# submodule in it
run(
......
......@@ -22,7 +22,13 @@ from swh.loader.tests import (
prepare_repository_from_archive,
)
from swh.model.hashutil import bytehex_to_hash, hash_to_bytes
from swh.model.model import ObjectType, Release, Snapshot, SnapshotBranch, TargetType
from swh.model.model import (
ObjectType,
Release,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
)
from swh.storage.algos.snapshot import snapshot_get_all_branches
SNAPSHOT1 = Snapshot(
......@@ -30,27 +36,27 @@ SNAPSHOT1 = Snapshot(
branches={
b"HEAD": SnapshotBranch(
target=b"refs/heads/master",
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
),
b"refs/heads/master": SnapshotBranch(
target=hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
),
b"refs/heads/branch1": SnapshotBranch(
target=hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
),
b"refs/heads/branch2": SnapshotBranch(
target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
),
b"refs/tags/branch2-after-delete": SnapshotBranch(
target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
),
b"refs/tags/branch2-before-delete": SnapshotBranch(
target=hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
),
},
)
......@@ -177,7 +183,7 @@ class CommonGitLoaderTests:
self.loader.get_contents = None
res = self.loader.load()
assert res == {"status": "failed"}
assert res["status"] == "failed"
assert_last_visit_matches(
self.loader.storage,
......@@ -199,7 +205,7 @@ class CommonGitLoaderTests:
)
res = self.loader.load()
assert res == {"status": "failed"}
assert res["status"] == "failed"
assert_last_visit_matches(
self.loader.storage,
......@@ -220,18 +226,21 @@ class CommonGitLoaderTests:
branches={interesting_branch: SNAPSHOT1.branches[interesting_branch]}
)
with patch.object(
utils,
"ignore_branch_name",
lambda name: name != interesting_branch,
), patch.object(
utils,
"filter_refs",
lambda refs: {
ref_name: utils.HexBytes(target)
for ref_name, target in refs.items()
if ref_name == interesting_branch
},
with (
patch.object(
utils,
"ignore_branch_name",
lambda name: name != interesting_branch,
),
patch.object(
utils,
"filter_refs",
lambda refs: {
ref_name: utils.HexBytes(target)
for ref_name, target in refs.items()
if ref_name == interesting_branch
},
),
):
# Ensure that only the interesting branch is loaded
res = self.loader.load()
......@@ -322,11 +331,11 @@ class FullGitLoaderTests(CommonGitLoaderTests):
branches = snapshot.branches
assert branches[b"HEAD"] == SnapshotBranch(
target=b"refs/heads/master",
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
)
assert branches[b"refs/heads/master"] == SnapshotBranch(
target=hash_to_bytes(new_revision),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
)
# Merge branch1 into HEAD.
......@@ -380,11 +389,11 @@ class FullGitLoaderTests(CommonGitLoaderTests):
merge_branches = merge_snapshot.branches
assert merge_branches[b"HEAD"] == SnapshotBranch(
target=b"refs/heads/master",
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
)
assert merge_branches[b"refs/heads/master"] == SnapshotBranch(
target=hash_to_bytes(merge_commit.decode()),
target_type=TargetType.REVISION,
target_type=SnapshotTargetType.REVISION,
)
def test_load_filter_branches(self):
......@@ -438,7 +447,7 @@ class FullGitLoaderTests(CommonGitLoaderTests):
assert branches[b"HEAD"] == SnapshotBranch(
target=b"refs/heads/dangling-branch",
target_type=TargetType.ALIAS,
target_type=SnapshotTargetType.ALIAS,
)
assert branches[b"refs/heads/dangling-branch"] is None
......@@ -507,7 +516,7 @@ class FullGitLoaderTests(CommonGitLoaderTests):
branches = self.loader.storage.snapshot_get_branches(self.loader.snapshot.id)
branch = branches["branches"][b"refs/tags/v1.0.0"]
assert branch.target_type == TargetType.RELEASE
assert branch.target_type == SnapshotTargetType.RELEASE
release = self.loader.storage.release_get([branch.target])[0]
assert release.date is not None
......@@ -544,7 +553,7 @@ class FullGitLoaderTests(CommonGitLoaderTests):
branches = self.loader.storage.snapshot_get_branches(self.loader.snapshot.id)
branch = branches["branches"][b"refs/tags/v1.0.0"]
assert branch.target_type == TargetType.RELEASE
assert branch.target_type == SnapshotTargetType.RELEASE
release = self.loader.storage.release_get([branch.target])[0]
assert release == Release(
......
# Copyright (C) 2018-2023 The Software Heritage developers
# Copyright (C) 2018-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -13,12 +13,13 @@ import subprocess
import sys
from tempfile import SpooledTemporaryFile
from threading import Thread
import time
from unittest.mock import MagicMock, call
import attr
from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException
from dulwich.pack import REF_DELTA
from dulwich.porcelain import push
from dulwich.porcelain import get_user_timezones, push
import dulwich.repo
from dulwich.tests.utils import build_pack
import pytest
......@@ -33,6 +34,7 @@ from swh.loader.tests import (
get_stats,
prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
......@@ -42,6 +44,8 @@ from swh.model.model import (
OriginVisitStatus,
RawExtrinsicMetadata,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
)
......@@ -71,9 +75,9 @@ class CommonGitLoaderNotFound:
"swh.loader.git.loader.GitLoader.fetch_pack_from_origin"
).side_effect = failure_exception
self.mocker.patch(
"swh.loader.git.loader.dumb.check_protocol"
).side_effect = HTTPError("404 not found")
self.mocker.patch("swh.loader.git.loader.dumb.check_protocol").side_effect = (
HTTPError("404 not found")
)
res = self.loader.load()
assert res == {"status": "uneventful"}
......@@ -108,7 +112,7 @@ class CommonGitLoaderNotFound:
mock.side_effect = failure_exception
res = self.loader.load()
assert res == {"status": "failed"}
assert res["status"] == "failed"
assert_last_visit_matches(
self.loader.storage,
......@@ -293,8 +297,12 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
def add_tag(tag_name, tag_message, commit):
    """Create an annotated tag in the test repository and return it.

    The tag object is added to the repository object store and the
    ``refs/tags/<tag_name>`` reference is set to the new tag id.

    Args:
        tag_name: name of the tag (concatenated to a bytes ref prefix)
        tag_message: message stored in the tag object
        commit: id of the commit the tag targets

    Returns:
        the newly created dulwich ``Tag`` object
    """
    annotated_tag = dulwich.objects.Tag()

    # What the tag targets and how it is labelled.
    annotated_tag.object = (dulwich.objects.Commit, commit)
    annotated_tag.name = tag_name
    annotated_tag.message = tag_message

    # Tagger identity and date are all set before check() is invoked.
    annotated_tag.tagger = b"John Doe <john.doe@example.org>"
    annotated_tag.tag_time = int(time.time())
    annotated_tag.tag_timezone = get_user_timezones()[0]
    annotated_tag.check()

    # Store the object, then make the tag reference point to it.
    tag_ref = b"refs/tags/" + tag_name
    self.repo.object_store.add_object(annotated_tag)
    self.repo[tag_ref] = annotated_tag.id
    return annotated_tag
......@@ -350,18 +358,30 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
# get all object ids that will be in storage after third load
objects_third_load = set(iter(self.repo.object_store))
# create a pack file containing full objects for newly added blob, tree,
# commit and tag in latest commit but also external references to objects
# that were discovered during the second loading of the repository
# create a pack file containing deltified objects for newly added blob, tree,
# commit and tag in latest commit whose bases are external objects that were
# discovered during the second loading of the repository
objects = []
new_objects_second_load = objects_second_load - objects_first_load
new_objects_third_load = objects_third_load - objects_second_load
for obj_id in new_objects_third_load:
obj = self.repo.object_store[obj_id]
objects.append((obj.type_num, obj.as_raw_string()))
for obj_id in new_objects_second_load:
obj = self.repo.object_store[obj_id]
objects.append((REF_DELTA, (obj_id, obj.as_raw_string())))
new_objects_second_load = [
self.repo.object_store[obj_id]
for obj_id in (objects_second_load - objects_first_load)
]
new_objects_third_load = [
self.repo.object_store[obj_id]
for obj_id in (objects_third_load - objects_second_load)
]
for new_obj in new_objects_third_load:
base_obj = next(
obj
for obj in new_objects_second_load
if obj.type_num == new_obj.type_num
)
objects.append(
(
REF_DELTA,
(base_obj.id, new_obj.as_raw_string()),
)
)
buffer = io.BytesIO()
build_pack(buffer, objects, self.repo.object_store)
......@@ -388,11 +408,11 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
corrupted_release = attr.evolve(release, id=b"\x00" * 20)
release_get = mocker.patch.object(self.loader.storage, "release_get")
release_get.return_value = [corrupted_release]
assert self.loader.load() == {"status": "failed"}
assert self.loader.load()["status"] == "failed"
elif missing_object:
revision_get = mocker.patch.object(self.loader.storage, "revision_get")
revision_get.return_value = [None]
assert self.loader.load() == {"status": "failed"}
assert self.loader.load()["status"] == "failed"
assert list(
sorted(
[c for c in statsd_calls if c[1][0] == statsd_metric],
......@@ -440,7 +460,7 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
# set max pack size to a really small value
self.loader.pack_size_bytes = 10
res = self.loader.load()
assert res == {"status": "failed"}
assert res["status"] == "failed"
assert sentry_events
assert sentry_events[0]["level"] == "error"
assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
......@@ -506,14 +526,14 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
self.repo_url,
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
# As it does not already have a snapshot, fall back to the parent origin
call(
f"base://{self.repo_url}",
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
]
......@@ -579,14 +599,14 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
self.repo_url,
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
# As it does not already have a snapshot, fall back to the parent origin
call(
f"base://{self.repo_url}",
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
]
......@@ -636,7 +656,7 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
# Tries the same origin, and finds a snapshot
call(
self.repo_url,
type=None,
type="git",
allowed_statuses=None,
require_snapshot=True,
),
......@@ -644,7 +664,7 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
# since the last visit
call(
f"base://{self.repo_url}",
type=None,
type="git",
allowed_statuses=None,
require_snapshot=True,
),
......@@ -762,14 +782,14 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
self.repo_url,
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
# As it does not already have a snapshot, fall back to the parent origin
call(
f"base://{self.repo_url}",
allowed_statuses=None,
require_snapshot=True,
type=None,
type="git",
),
]
......@@ -957,6 +977,42 @@ class DumbGitLoaderTestBase(FullGitLoaderTests):
assert b"HEAD" in self.loader.snapshot.branches
assert self.loader.snapshot.branches[b"HEAD"].target == b"refs/heads/master"
def test_load_refs_targeting_tree_or_blob(self, mocker):
    """Check that refs targeting a tree or a blob end up in the snapshot.

    The ``info/refs`` listing returned by the dumb fetcher is prefixed with
    two extra refs, one targeting a tree and one targeting a blob. After
    loading, the snapshot must hold a DIRECTORY branch and a CONTENT branch
    for them.
    """
    known_tree = "fbf70528223d263661b5ad4b80f26caf3860eb8e"
    known_blob = "534d61ecee4f6da4d6ca6ddd8abf258208d2d1bc"
    tree_ref = "refs/tree"
    blob_ref = "refs/blob"

    class GitObjectsFetcherTreeAndBlobRefs(dumb.GitObjectsFetcher):
        def _http_get(self, path: str) -> SpooledTemporaryFile:
            buffer = super()._http_get(path)
            if path != "info/refs":
                # Any other file is served untouched
                return buffer

            # Prepend two refs targeting tree and blob to the refs list,
            # then rewind the buffer so callers read it from the start
            original_refs = buffer.read().decode("utf-8")
            patched_refs = (
                f"{known_tree}\t{tree_ref}\n"
                f"{known_blob}\t{blob_ref}\n" + original_refs
            )
            buffer.seek(0)
            buffer.write(patched_refs.encode())
            buffer.flush()
            buffer.seek(0)
            return buffer

    mocker.patch.object(dumb, "GitObjectsFetcher", GitObjectsFetcherTreeAndBlobRefs)

    load_status = self.loader.load()
    assert load_status == {"status": "eventful"}

    snapshot_branches = self.loader.snapshot.branches
    expected_tree_branch = SnapshotBranch(
        target=hash_to_bytes(known_tree),
        target_type=SnapshotTargetType.DIRECTORY,
    )
    expected_blob_branch = SnapshotBranch(
        target=hash_to_bytes(known_blob),
        target_type=SnapshotTargetType.CONTENT,
    )
    assert snapshot_branches[tree_ref.encode()] == expected_tree_branch
    assert snapshot_branches[blob_ref.encode()] == expected_blob_branch
class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase):
@classmethod
......@@ -993,7 +1049,7 @@ class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase):
def test_http_get_retry(self, mocker, requests_mock):
requests_mock.real_http = True
sleep = mocker.patch.object(dumb.GitObjectsFetcher._http_get.retry, "sleep")
sleep = mocker.patch("time.sleep")
nb_files = 0
......@@ -1023,7 +1079,7 @@ class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase):
assert res == {"status": "eventful"}
sleep.assert_has_calls([mocker.call(param) for param in [1] * nb_files])
sleep = mocker.patch.object(dumb.check_protocol.retry, "sleep")
sleep = mocker.patch("time.sleep")
with open(os.path.join(self.bare_repo_path, "info/refs"), "rb") as refs_data:
requests_mock.get(
f"{self.repo_url}/info/refs",
......@@ -1046,7 +1102,7 @@ class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase):
# set max pack size to a really small value
self.loader.pack_size_bytes = 10
res = self.loader.load()
assert res == {"status": "failed"}
assert res["status"] == "failed"
assert sentry_events
assert sentry_events[0]["level"] == "error"
assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
......@@ -1104,13 +1160,13 @@ def test_loader_too_large_pack_file_for_github_origin(
return_value=[metadata],
)
assert loader.load() == {"status": "failed"}
assert loader.load()["status"] == "failed"
assert sentry_events
assert sentry_events[0]["level"] == "error"
assert sentry_events[0]["exception"]["values"][0]["value"] == (
f"Pack file too big for repository {repo_url}, "
f"limit is {loader.pack_size_bytes} bytes, current size is {big_size_kib*1024}"
f"limit is {loader.pack_size_bytes} bytes, current size is {big_size_kib * 1024}"
)
......
......@@ -22,16 +22,18 @@ commands =
[testenv:black]
skip_install = true
deps =
black==23.1.0
black==25.1.0
commands =
{envpython} -m black --check swh
[testenv:flake8]
skip_install = true
deps =
flake8==5.0.4
flake8-bugbear==22.9.23
pycodestyle==2.9.1
flake8==7.1.1
flake8-bugbear==24.12.12
flake8-pyproject==1.2.3
pycodestyle==2.12.1
commands =
{envpython} -m flake8
......@@ -39,7 +41,7 @@ commands =
extras =
testing
deps =
mypy==1.8.0
mypy==1.15.0
commands =
mypy swh
......