Skip to content
Snippets Groups Projects
Commit 92d9ada9 authored by Nicolas Dandrimont's avatar Nicolas Dandrimont
Browse files

Implement discovering branch targets from the archive

With the proper implementation of packfile negotiation, remotes can
return packfiles that do not contain all of our wanted objects. Consider
the following two histories:

* c1                                      * c1     ← [refs/tags/original]
↑                                         ↑ :arrow_upper_left:
* c2 ← [refs/heads/main]                  |   * c3 ← [refs/heads/main]
                                          * c2     ← [refs/heads/broken]

The first visit of the origin would load commits c1 and c2, and write a
snapshot referencing c2.

During the second visit, the loader would tell the origin that c2 is
known, and that c1 and c3 are wanted (as new heads). The origin, knowing
that c1 is a parent of c2, would be allowed by the git protocol to send
a packfile containing only c3. Under these circumstances, the loader
cannot tell what object type the snapshot branch
[refs/tags/original] should point to.

The repository in tests has a similar structure ([refs/heads/master] is
in the history of [refs/tags/branch2-before-delete]), so refactor the
incremental load test to exercise this specific behavior. This test can
be moved to the common tests as well.
parent e7988153
Branches debian/unstable-swh
Tags debian/2.1.0-1_swh1
No related merge requests found
......@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from dataclasses import dataclass
import datetime
import logging
......@@ -496,6 +497,47 @@ class GitLoader(BaseGitLoader):
if not branch:
unknown_objects[ref_name] = target
if unknown_objects and self.base_snapshots:
# The remote has sent us a partial packfile. It will have skipped
# objects that it knows are ancestors of the heads we have sent as
# known. We can look these objects up in the archive, as they should
# have had all their ancestors loaded when the previous snapshot was
# loaded.
refs_for_target = defaultdict(list)
for ref_name, target in unknown_objects.items():
refs_for_target[target].append(ref_name)
targets_unknown = set(refs_for_target)
for method, target_type in (
(self.storage.revision_missing, TargetType.REVISION),
(self.storage.release_missing, TargetType.RELEASE),
(self.storage.directory_missing, TargetType.DIRECTORY),
(self.storage.content_missing_per_sha1_git, TargetType.CONTENT),
):
missing = set(method(list(targets_unknown)))
known = targets_unknown - missing
for target in known:
for ref_name in refs_for_target[target]:
logger.debug(
"Inferred type %s for branch %s pointing at unfetched %s",
target_type.name,
ref_name.decode(),
hashutil.hash_to_hex(target),
extra={
"swh_type": "swh_loader_git_inferred_target_type"
},
)
branches[ref_name] = SnapshotBranch(
target=target, target_type=target_type
)
del unknown_objects[ref_name]
targets_unknown = missing
if not targets_unknown:
break
if unknown_objects:
# This object was referenced by the server; We did not fetch
# it, and we do not know it from the previous snapshot. This is
......
......@@ -6,12 +6,14 @@
import copy
import datetime
import os.path
from unittest.mock import patch
import dulwich.objects
import dulwich.porcelain
import dulwich.repo
import pytest
from swh.loader.git import utils
from swh.loader.git.from_disk import GitLoaderFromArchive, GitLoaderFromDisk
from swh.loader.tests import (
assert_last_visit_matches,
......@@ -207,6 +209,52 @@ class CommonGitLoaderTests:
snapshot=None,
)
def test_load_incremental_partial_history(self):
"""Check that loading a partial snapshot, then negotiating a full snapshot, works."""
# We pick this branch because it contains the target of the refs/heads/master
# branch of the full snapshot in its history
interesting_branch = b"refs/tags/branch2-before-delete"
partial_snapshot = Snapshot(
branches={interesting_branch: SNAPSHOT1.branches[interesting_branch]}
)
with patch.object(
utils,
"ignore_branch_name",
lambda name: name != interesting_branch,
), patch.object(
utils,
"filter_refs",
lambda refs: {
ref_name: utils.HexBytes(target)
for ref_name, target in refs.items()
if ref_name == interesting_branch
},
):
# Ensure that only the interesting branch is loaded
res = self.loader.load()
assert res == {"status": "eventful"}
assert self.loader.storage.snapshot_get_branches(partial_snapshot.id)
res = self.loader.load()
assert res == {"status": "eventful"}
stats = get_stats(self.loader.storage)
assert stats == {
"content": 4,
"directory": 7,
"origin": 1,
"origin_visit": 2,
"release": 0,
"revision": 7,
"skipped_content": 0,
"snapshot": 2,
}
class FullGitLoaderTests(CommonGitLoaderTests):
"""Tests for GitLoader (from disk or not). Includes the common ones, and
......
......@@ -6,6 +6,7 @@
import datetime
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler
import logging
import os
import subprocess
import sys
......@@ -26,14 +27,7 @@ from swh.loader.tests import (
get_stats,
prepare_repository_from_archive,
)
from swh.model.model import (
Origin,
OriginVisit,
OriginVisitStatus,
Snapshot,
SnapshotBranch,
TargetType,
)
from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot
class CommonGitLoaderNotFound:
......@@ -225,6 +219,26 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
"has_parent_origins": False,
}
def test_load_incremental_partial_history(self, caplog):
with caplog.at_level(logging.DEBUG, logger="swh.loader.git.loader"):
super().test_load_incremental_partial_history()
# Check that we've indeed inferred the target type for one of the snapshot
# branches
for record in caplog.records:
if (
hasattr(record, "swh_type")
and record.swh_type == "swh_loader_git_inferred_target_type"
):
assert record.args == (
"REVISION",
"refs/heads/master",
SNAPSHOT1.branches[b"refs/heads/master"].target.hex(),
)
break
else:
assert False, "did not find log message for inferred branch target type"
class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
"""Mostly the same loading scenario but with a ``parent_origin`` different from the
......@@ -564,56 +578,6 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
call("git_known_refs_percent", "h", expected_git_known_refs_percent, {}, 1),
]
def test_load_incremental_negotiation(self):
"""Check that the packfile negotiated when running an incremental load only
contains the "new" commits, and not all objects."""
snapshot_id = b"\x01" * 20
now = datetime.datetime.now(tz=datetime.timezone.utc)
def ovgl(origin_url, allowed_statuses, require_snapshot, type):
if origin_url == f"base://{self.repo_url}":
return OriginVisit(origin=origin_url, visit=42, date=now, type="git")
else:
return None
self.loader.storage.origin_visit_get_latest.side_effect = ovgl
self.loader.storage.origin_visit_status_get_latest.return_value = (
OriginVisitStatus(
origin=f"base://{self.repo_url}",
visit=42,
snapshot=snapshot_id,
date=now,
status="full",
)
)
self.loader.storage.snapshot_get_branches.return_value = {
"id": snapshot_id,
"branches": {
b"refs/heads/master": SnapshotBranch(
# id of the initial commit in the git repository fixture
target=bytes.fromhex("b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a"),
target_type=TargetType.REVISION,
),
},
"next_branch": None,
}
res = self.loader.load()
assert res == {"status": "eventful"}
stats = get_stats(self.loader.storage)
assert stats == {
"content": 3, # instead of 4 for the full repository
"directory": 6, # instead of 7
"origin": 1,
"origin_visit": 1,
"release": 0,
"revision": 6, # instead of 7
"skipped_content": 0,
"snapshot": 1,
}
class DumbGitLoaderTestBase(FullGitLoaderTests):
"""Prepare a git repository to be loaded using the HTTP dumb transfer protocol."""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment