Skip to content
Snippets Groups Projects
Verified Commit 0756c44e authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Adapt directory loader visit type depending on the vcs tree to ingest

Prior to this, it was sending only 'directory' types for all vcs trees. Multiple
directory loaders now exist whose visit type are currently diverging, so the scheduling
would not happen correctly without it. This commit is the required adaptation for the
scheduling to work appropriately.

Refs. swh/meta#4979
parent 9f252fc8
No related branches found
No related tags found
1 merge request!472Adapt directory loader visit type depending on the vcs tree to ingest
Pipeline #2804 passed
......@@ -312,6 +312,14 @@ VCS_KEYS_MAPPING = {
}
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE = dict(
    git="git-checkout",
    hg="hg-checkout",
    svn="svn-export",
)
"""Mapping between the vcs artifact type to the loader's visit type."""
class NixGuixLister(StatelessLister[PageResult]):
"""List Guix or Nix sources out of a public json manifest.
......@@ -462,7 +470,7 @@ class NixGuixLister(StatelessLister[PageResult]):
fallback_urls=[],
checksums=checksums,
checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
visit_type="directory",
visit_type=VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type],
ref=plain_ref,
)
......
......@@ -295,6 +295,22 @@
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"integrity": "sha256-yOAaOu/HiG1N/r2tAtdou/fPB+rEJt1TQbIGzQn7/pI="
},
{
"type": "hg",
"hg_url": "https://hg.sr.ht/~olly/yoyo",
"integrity": "sha256-mME9v34RyvpoCATSiLYqN78gtPNK4+1Pj54P2d5KX2A=",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"hg_changeset": "v7.2.0-release"
},
{
"type": "svn",
"svn_url": "svn://svn.savannah.gnu.org/apl/trunk",
"integrity": "sha256-48aUFwjpsAlJ4Kw6oBWW3d57NQwV5igwzr8Ml4Aa7K0=",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"svn_revision": "1550"
}
],
"version": "1",
......
......@@ -18,6 +18,7 @@ from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
ArtifactWithoutExtension,
......@@ -275,21 +276,25 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
expected_visit_types[artifact_type] += 1
outputHashMode = artifact.get("outputHashMode", "flat")
if outputHashMode == "recursive":
# 1 origin of type "directory" is listed in that case too
expected_visit_types["directory"] += 1
# Those are specific
visit_type = VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type]
expected_visit_types[visit_type] += 1
# 1 origin of type `visit_type` is listed in that case too
expected_nb_pages += 1
elif artifact_type == "url":
url = artifact["urls"][0]
if url.endswith(".git"):
expected_visit_types["git"] += 1
visit_type = "git"
elif url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
visit_type = "content"
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
visit_type = "svn"
elif "crates.io" in url or "codeload.github.com" in url:
expected_visit_types["directory"] += 1
visit_type = "directory"
else: # tarball artifacts
expected_visit_types["directory"] += 1
visit_type = "directory"
expected_visit_types[visit_type] += 1
assert set(expected_visit_types.keys()) == {
"content",
......@@ -297,14 +302,19 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"svn",
"hg",
"directory",
"git-checkout",
"svn-export",
"hg-checkout",
}
listed_result = lister.run()
# Each artifact is considered an origin (even "url" artifacts with mirror urls) but
expected_nb_origins = sum(expected_visit_types.values())
# 1 origin is duplicated for both visit_type 'git' and 'directory'
expected_nb_dictincts_origins = expected_nb_origins - 1
# 3 origins have their recursive hash mentioned, they are sent both as vcs and as
# specific vcs directory to ingest. So they are duplicated with visit_type 'git' and
# 'git-checkout', 'svn' and 'svn-export', 'hg' and 'hg-checkout'.
expected_nb_dictincts_origins = expected_nb_origins - 3
# 1 page read is 1 origin
assert listed_result == ListerStats(
......@@ -316,15 +326,31 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
).results
assert len(scheduler_origins) == expected_nb_origins
# The dataset will trigger 2 listed origins, with 2 distinct visit types
duplicated_url = "https://example.org/rgerganov/footswitch"
duplicated_visit_types = [
origin.visit_type
for origin in scheduler_origins
if origin.url == duplicated_url
]
# The test dataset will trigger some origins duplicated as mentioned above
# Let's check them out
duplicated_visit_types = []
for duplicated_url in [
"https://example.org/rgerganov/footswitch",
"https://hg.sr.ht/~olly/yoyo",
"svn://svn.savannah.gnu.org/apl/trunk",
]:
duplicated_visit_types.extend(
[
origin.visit_type
for origin in scheduler_origins
if origin.url == duplicated_url
]
)
assert set(duplicated_visit_types) == {"git", "directory"}
assert len(duplicated_visit_types) == 6
assert set(duplicated_visit_types) == {
"git",
"git-checkout",
"svn",
"svn-export",
"hg",
"hg-checkout",
}
mapping_visit_types = defaultdict(int)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment