Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

......@@ -5,17 +5,25 @@
from pathlib import Path
from dulwich import porcelain
import iso8601
from swh.lister.julia.lister import JuliaLister
from swh.lister.julia.tests import prepare_repository_from_archive
expected_origins = [
"https://github.com/leios/Fable.jl.git",
"https://github.com/oscar-system/Oscar.jl.git",
]
expected_origins_0 = {
"https://github.com/leios/Fable.jl.git": "2001-01-02T17:18:19+00:00",
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-03T17:18:19+00:00",
}
expected_origins_1 = {
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-04T17:18:19+00:00",
"https://github.com/serenity4/VulkanSpec.jl.git": "2001-01-05T17:18:19+00:00",
}
def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
......@@ -33,17 +41,18 @@ def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
def test_julia_lister(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
lister.REPO_PATH = Path(tmp_path, "General")
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
res = lister.run()
assert res.origins == 1 + 1
assert res.origins == len(expected_origins_0)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins)
assert len(scheduler_origins) == len(expected_origins_0)
assert {
(
scheduled.visit_type,
......@@ -51,4 +60,106 @@ def test_julia_lister(datadir, tmp_path, swh_scheduler):
scheduled.last_update,
)
for scheduled in scheduler_origins
} == {("git", expected, None) for expected in expected_origins}
} == {
("git", origin, iso8601.parse_date(last_update))
for origin, last_update in expected_origins_0.items()
}
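The set comparison above relies on iso8601.parse_date turning the expected timestamp strings into timezone-aware datetimes, which is the form last_update takes on the listed origins. A minimal illustration, using a value from expected_origins_0:

import iso8601

dt = iso8601.parse_date("2001-01-02T17:18:19+00:00")
# dt is a timezone-aware datetime in UTC (2001-01-02 17:18:19+00:00)
assert dt.tzinfo is not None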
def test_julia_lister_incremental(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
# Prepare first run
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
lister.REPO_PATH = Path(tmp_path, "General")
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
# Latest Git commit hash expected
with porcelain.open_repo_closing(lister.REPO_PATH) as r:
expected_last_seen_commit = r.head().decode("ascii")
assert expected_last_seen_commit is not None
assert lister.state.last_seen_commit is None
# First run
res = lister.run()
assert res.pages == 1
assert res.origins == len(expected_origins_0)
assert lister.state.last_seen_commit == expected_last_seen_commit
scheduler_origins_0 = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins_0) == len(expected_origins_0)
assert {
(
scheduled.visit_type,
scheduled.url,
scheduled.last_update,
)
for scheduled in scheduler_origins_0
} == {
("git", origin, iso8601.parse_date(last_update))
for origin, last_update in expected_origins_0.items()
}
# Prepare second run
archive_path = Path(datadir, "fake-julia-registry-repository_1.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
lister.REPO_PATH = Path(tmp_path, "General")
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
assert lister.state.last_seen_commit == expected_last_seen_commit
with porcelain.open_repo_closing(lister.REPO_PATH) as repo:
new_expected_last_seen_commit = repo.head().decode("ascii")
assert expected_last_seen_commit != new_expected_last_seen_commit
# Second run
res = lister.run()
assert lister.state.last_seen_commit == new_expected_last_seen_commit
assert res.pages == 1
# One new package, one new version
assert res.origins == len(expected_origins_1)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
expected_origins = {**expected_origins_0, **expected_origins_1}
assert len(scheduler_origins) == len(expected_origins)
def test_julia_lister_incremental_no_changes(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
lister.REPO_PATH = Path(tmp_path, "General")
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
# Latest Git commit hash expected
with porcelain.open_repo_closing(lister.REPO_PATH) as r:
expected_last_seen_commit = r.head().decode("ascii")
assert expected_last_seen_commit is not None
assert lister.state.last_seen_commit is None
# First run
res = lister.run()
assert res.pages == 1
assert res.origins == len(expected_origins_0)
assert expected_last_seen_commit is not None
assert lister.state.last_seen_commit == expected_last_seen_commit
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins_0)
# Prepare second run; the repository state is the same as in the first run
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
assert lister.state.last_seen_commit == expected_last_seen_commit
# Second run
res = lister.run()
assert lister.state.last_seen_commit == expected_last_seen_commit
assert res.pages == 1
# Nothing new
assert res.origins == 0
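The incremental tests above exercise a lister state keyed on the registry's Git history: after a run, last_seen_commit stores the registry HEAD, and a later run only needs to consider commits made since then. A minimal sketch of that idea with dulwich (not the actual JuliaLister implementation; the helper name and state handling are illustrative):

from dulwich import porcelain

def commits_since(repo_path, last_seen_commit=None):
    """Return commit ids reachable from HEAD but not from last_seen_commit."""
    with porcelain.open_repo_closing(str(repo_path)) as repo:
        exclude = [last_seen_commit.encode("ascii")] if last_seen_commit else []
        walker = repo.get_walker(include=[repo.head()], exclude=exclude)
        return [entry.commit.id.decode("ascii") for entry in walker]

# First run: no stored state, so every commit (hence every package) is considered.
# Second run: only commits added on top of last_seen_commit remain, which is why
# test_julia_lister_incremental_no_changes ends with res.origins == 0.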
......@@ -111,6 +111,8 @@ class Artifact:
"""Checksum layout mode to provide to loaders (e.g. nar, standard, ...)"""
ref: Optional[str]
"""Optional reference on the artifact (git commit, branch, svn commit, tag, ...)"""
submodules: bool
"""Indicates if submodules should be retrieved for a git-checkout visit type"""
@dataclass
......@@ -142,10 +144,11 @@ POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_endswith(
def url_contains_tarball_filename(
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
"""Determine whether urlparsed ends with one of the extensions passed as parameter.
"""Determine whether urlparsed contains a tarball filename ending with one of the
extensions passed as parameter, path parts and query parameters are checked.
This also account for the edge case of a filename with only a version as name (so no
extension in the end.)
......@@ -156,11 +159,15 @@ def url_endswith(
"""
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
match = any(
path_part.endswith(tuple(extensions))
for path in paths
for path_part in path.parts
)
if match:
return match
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
# Some false negatives can happen (e.g. https://<netloc>/path/0.1.5), so make sure
# to catch those
name = Path(urlparsed.path).name
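For illustration, these are the two URL shapes the reworked check is meant to accept (the same URLs exercised in the nixguix tests further down; TARBALL_EXTENSIONS is the module's own extension list):

from urllib.parse import urlparse

# tarball filename in a non-final path part
url_contains_tarball_filename(
    urlparse("https://example.org/download/one.tar.gz/other/path/parts"),
    TARBALL_EXTENSIONS,
)  # -> True

# tarball filename only present in a query parameter
url_contains_tarball_filename(
    urlparse("https://example.org/download.php?foo=bar&file=one.tar.gz"),
    TARBALL_EXTENSIONS,
)  # -> True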
......@@ -212,7 +219,7 @@ def is_tarball(
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)
# Check all urls; as soon as one url allows the nature detection, this stops.
exceptions_to_raise = []
......@@ -281,7 +288,7 @@ def is_tarball(
break
return (
url_endswith(
url_contains_tarball_filename(
urlparse(filename),
TARBALL_EXTENSIONS,
raise_when_no_extension=False,
......@@ -472,6 +479,7 @@ class NixGuixLister(StatelessLister[PageResult]):
checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
visit_type=VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type],
ref=plain_ref,
submodules=artifact.get("submodule", False),
)
elif artifact_type == "url":
......@@ -582,7 +590,7 @@ class NixGuixLister(StatelessLister[PageResult]):
# Let's check and filter it out if it is to be ignored (if possible).
# Some origin urls may not have an extension at this point (e.g.
# http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz); let them through.
if url_endswith(
if url_contains_tarball_filename(
urlparse(origin),
self.extensions_to_ignore,
raise_when_no_extension=False,
......@@ -605,6 +613,7 @@ class NixGuixLister(StatelessLister[PageResult]):
checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
visit_type="tarball-directory" if is_tar else "content",
ref=None,
submodules=False,
)
else:
logger.warning(
......@@ -626,13 +635,15 @@ class NixGuixLister(StatelessLister[PageResult]):
def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
"""Given an artifact (tarball, file), yield one ListedOrigin."""
assert self.lister_obj.id is not None
loader_arguments = {
loader_arguments: Dict[str, Any] = {
"checksums": artifact.checksums,
"checksum_layout": artifact.checksum_layout.value,
"fallback_urls": artifact.fallback_urls,
}
if artifact.ref:
loader_arguments["ref"] = artifact.ref
if artifact.submodules:
loader_arguments["submodules"] = artifact.submodules
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=artifact.origin,
......
......@@ -2,17 +2,23 @@
"sources": [
{
"type": "url",
"urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
"urls": [
"https://github.com/owner-1/repository-1/revision-1.tgz"
],
"integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
},
{
"type": "url",
"urls": [ "https://github.com/owner-3/repository-1/revision-1.tar" ],
"urls": [
"https://github.com/owner-3/repository-1/revision-1.tar"
],
"integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
},
{
"type": "url",
"urls": [ "https://example.com/file.txt" ],
"urls": [
"https://example.com/file.txt"
],
"integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
},
{
......@@ -99,7 +105,9 @@
},
{
"type": "url",
"urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
"urls": [
"svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"
],
"integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
},
{
......@@ -311,8 +319,17 @@
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"svn_revision": "1550"
},
{
"type": "git",
"git_url": "https://github.com/supercollider/supercollider",
"integrity": "sha256-YSXpITazkV/IqMquPfj0hC7oRS2yH399IFJU4qmyd7Y=",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"git_ref": "Version-3.13.0",
"submodule": true
}
],
"version": "1",
"revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
}
}
\ No newline at end of file
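For the new git source above (a git_ref plus "submodule": true), the lister is expected to emit a git-checkout origin whose loader arguments carry the submodules flag, roughly along these lines (illustrative; checksums abbreviated, and the checksum_layout value for outputHashMode "recursive" assumed to be "nar"):

# Illustrative only -- expected extra loader arguments for the supercollider origin
expected_loader_arguments = {
    "checksums": {"sha256": "..."},
    "checksum_layout": "nar",
    "fallback_urls": [],
    "ref": "Version-3.13.0",
    "submodules": True,
}

This mirrors the artifact_to_listed_origin change earlier in the diff, where "ref" and "submodules" are only added when set, and the assertion in the lister test below that extra_loader_arguments["submodules"] is True for the supercollider origin.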
......@@ -24,7 +24,7 @@ from swh.lister.nixguix.lister import (
ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
url_endswith,
url_contains_tarball_filename,
)
from swh.lister.pattern import ListerStats
......@@ -65,7 +65,7 @@ def test_url_endswith(name, expected_result):
"""It should detect whether url or query params of the urls ends with extensions"""
urlparsed = urlparse(f"https://example.org/{name}")
assert (
url_endswith(
url_contains_tarball_filename(
urlparsed,
TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
raise_when_no_extension=False,
......@@ -81,7 +81,7 @@ def test_url_endswith_raise(name):
"""It should raise when the tested url has no extension"""
urlparsed = urlparse(f"https://example.org/{name}")
with pytest.raises(ArtifactWithoutExtension):
url_endswith(urlparsed, ["unimportant"])
url_contains_tarball_filename(urlparsed, ["unimportant"])
@pytest.mark.parametrize(
......@@ -98,12 +98,15 @@ def test_is_tarball_simple(tarballs):
@pytest.mark.parametrize(
"query_param",
["file", "f", "url", "name", "anykeyreally"],
"url",
[
"https://example.org/download/one.tar.gz/other/path/parts",
"https://example.org/download.php?foo=bar&file=one.tar.gz",
],
)
def test_is_tarball_not_so_simple(query_param):
"""More involved check on tarball should discriminate between tarball and file"""
url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz"
def test_is_tarball_not_so_simple(url):
"""Detect tarball URL when filename is not in the last path parts or
in a query parameter"""
is_tar, origin = is_tarball([url])
assert is_tar is True
assert origin == url
......@@ -225,6 +228,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"https://api.github.com/repos/trie/trie",
[{"json": {"html_url": "https://github.com/trie/trie.git"}}],
)
requests_mock.get(
"https://api.github.com/repos/supercollider/supercollider",
[{"json": {"html_url": "https://github.com/supercollider/supercollider"}}],
)
requests_mock.head(
"http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz",
headers={"Content-Type": "application/gzip; charset=ISO-8859-1"},
......@@ -314,7 +321,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
# 4 origins have their recursive hash mentioned; they are sent both as vcs and as
# specific vcs directory to ingest. So they are duplicated with visit_type 'git' and
# 'git-checkout', 'svn' and 'svn-export', 'hg' and 'hg-checkout'.
expected_nb_dictincts_origins = expected_nb_origins - 3
expected_nb_dictincts_origins = expected_nb_origins - 4
# 1 page read is 1 origin
assert listed_result == ListerStats(
......@@ -333,6 +340,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"https://example.org/rgerganov/footswitch",
"https://hg.sr.ht/~olly/yoyo",
"svn://svn.savannah.gnu.org/apl/trunk",
"https://github.com/supercollider/supercollider",
]:
duplicated_visit_types.extend(
[
......@@ -342,7 +350,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
]
)
assert len(duplicated_visit_types) == 6
assert len(duplicated_visit_types) == 8
assert set(duplicated_visit_types) == {
"git",
"git-checkout",
......@@ -361,6 +369,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
if listed_origin.visit_type in {"git-checkout", "svn-export", "hg-checkout"}:
assert listed_origin.extra_loader_arguments["ref"] is not None
if listed_origin.url == "https://github.com/supercollider/supercollider":
assert listed_origin.extra_loader_arguments["submodules"] is True
else:
assert "submodules" not in listed_origin.extra_loader_arguments
mapping_visit_types[listed_origin.visit_type] += 1
......
......@@ -92,6 +92,7 @@ def test_npm_lister_full(
"GET",
lister.API_FULL_LISTING_URL,
params=_url_params(page_size + 1, startkey='""'),
timeout=(120, 60),
),
mocker.call(
"GET",
......@@ -100,6 +101,7 @@ def test_npm_lister_full(
page_size + 1,
startkey=f'"{npm_full_listing_page1["rows"][-1]["id"]}"',
),
timeout=(120, 60),
),
]
)
......@@ -152,6 +154,7 @@ def test_npm_lister_incremental(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since="0"),
timeout=(120, 60),
),
mocker.call(
"GET",
......@@ -160,11 +163,13 @@ def test_npm_lister_incremental(
page_size,
since=str(npm_incremental_listing_page1["results"][-1]["seq"]),
),
timeout=(120, 60),
),
mocker.call(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since=str(last_seq)),
timeout=(120, 60),
),
]
)
......@@ -204,6 +209,7 @@ def test_npm_lister_incremental_restart(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since=str(last_seq)),
timeout=(120, 60),
)
......
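The timeout=(120, 60) now expected in these calls is the requests-style (connect, read) timeout tuple that the base lister passes on every HTTP request by default (see the Lister base class change below). For reference, this is plain requests behaviour:

import requests

# a 2-tuple timeout means (connect timeout, read timeout), in seconds
requests.get("https://example.org/", timeout=(120, 60))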
......@@ -90,6 +90,10 @@ class Lister(Generic[StateType, PageType]):
max_origins_per_page: the maximum number of origins processed per page
enable_origins: whether the created origins should be enabled or not
record_batch_size: maximum number of records to flush to the scheduler at once.
connect_timeout: requests connection timeout in seconds.
read_timeout: requests read timeout in seconds.
verify_certs: whether to verify the TLS certificates in HTTPS requests.
requests_extra_kwargs: extra keyword arguments to pass to :mod:`requests` calls.
Generic types:
- *StateType*: concrete lister type; should usually be a :class:`dataclass` for
......@@ -113,6 +117,10 @@ class Lister(Generic[StateType, PageType]):
enable_origins: bool = True,
with_github_session: bool = False,
record_batch_size: int = 1000,
connect_timeout: float = 120,
read_timeout: float = 60,
verify_certs: bool = True,
requests_extra_kwargs: Dict[str, Any] = {},
):
if not self.LISTER_NAME:
raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
......@@ -168,6 +176,10 @@ class Lister(Generic[StateType, PageType]):
self.max_origins_per_page = max_origins_per_page
self.enable_origins = enable_origins
self.record_batch_size = record_batch_size
self.requests_extra_kwargs = requests_extra_kwargs
if not verify_certs:
self.requests_extra_kwargs["verify"] = False
self.requests_extra_kwargs["timeout"] = (connect_timeout, read_timeout)
def build_url(self, instance: str) -> str:
"""Optionally build the forge url to list. When the url is not provided in the
......@@ -188,11 +200,14 @@ class Lister(Generic[StateType, PageType]):
raise ValueError("Instance should only be a net location.")
return f"https://{instance}"
def requests_kwargs(self, **kwargs):
return {**self.requests_extra_kwargs, **kwargs}
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
response = self.session.request(method, url, **kwargs)
response = self.session.request(method, url, **self.requests_kwargs(**kwargs))
if response.status_code not in (200, 304):
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
......
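A minimal sketch of how the new settings flow into HTTP calls, assuming any concrete Lister subclass (the class name below is hypothetical). Because requests_kwargs merges the per-call kwargs last, explicit arguments still override the instance-wide defaults:

lister = SomeLister(        # hypothetical subclass of Lister
    scheduler=scheduler,
    url="https://forge.example.org",
    connect_timeout=30,
    read_timeout=10,
    verify_certs=False,
)
assert lister.requests_extra_kwargs == {"verify": False, "timeout": (30, 10)}

# http_request() merges these defaults with the kwargs of the call itself;
# an explicit timeout passed by a lister wins over the configured one:
assert lister.requests_kwargs(timeout=5, params={"page": 1}) == {
    "verify": False,
    "timeout": 5,
    "params": {"page": 1},
}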