Skip to content
Snippets Groups Projects
Commit 42e76ee6 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

crates: Speedup listing by processing crates in batch

Instead of having a single crate and its versions info per page,
prefer to have up to 1000 crates per page to significantly speedup
the listing process.
parent c6aa490f
No related branches found
No related tags found
1 merge request!532crates: Fixes and improvements
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -17,6 +17,7 @@ from urllib.parse import urlparse
import iso8601
from looseversion import LooseVersion2
from swh.core.utils import grouper
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -25,7 +26,7 @@ from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
CratesListerPage = List[Dict[str, Any]]
CratesListerPage = List[List[Dict[str, Any]]]
@dataclass
......@@ -198,17 +199,21 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
logger.debug("Found %s crates in crates_index", len(dataset))
# Each entry from dataset will correspond to a page
for name, item in dataset.items():
# a page contains up to 1000 crates with versions info
for crates in grouper(dataset.items(), 1000):
page = []
# sort crate versions
versions = sorted(item["versions"].keys(), key=LooseVersion2)
for name, item in crates:
crate_versions = []
# sort crate versions
versions = sorted(item["versions"].keys(), key=LooseVersion2)
for version in versions:
v = item["versions"][version]
v["name"] = name
v["version"] = version
page.append(self.page_entry_dict(v))
for version in versions:
v = item["versions"][version]
v["name"] = name
v["version"] = version
crate_versions.append(self.page_entry_dict(v))
page.append(crate_versions)
yield page
self.all_crates_processed = True
......@@ -217,45 +222,46 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
"""Iterate on all crate pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
url = self.CRATE_URL_PATTERN.format(crate=page[0]["name"])
last_update = page[0]["last_update"]
artifacts = []
crates_metadata = []
for entry in page:
# Build an artifact entry following original-artifacts-json specification
# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950
artifacts.append(
{
"version": entry["version"],
"filename": entry["filename"],
"url": entry["crate_file"],
"checksums": {
"sha256": entry["checksum"],
},
}
)
crates_metadata.append(
{
"version": entry["version"],
"yanked": entry["yanked"],
"last_update": entry["last_update"],
}
for crate_versions in page:
url = self.CRATE_URL_PATTERN.format(crate=crate_versions[0]["name"])
last_update = crate_versions[0]["last_update"]
artifacts = []
crates_metadata = []
for entry in crate_versions:
# Build an artifact entry following original-artifacts-json specification
# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950
artifacts.append(
{
"version": entry["version"],
"filename": entry["filename"],
"url": entry["crate_file"],
"checksums": {
"sha256": entry["checksum"],
},
}
)
crates_metadata.append(
{
"version": entry["version"],
"yanked": entry["yanked"],
"last_update": entry["last_update"],
}
)
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=iso8601.parse_date(last_update),
extra_loader_arguments={
"artifacts": artifacts,
"crates_metadata": crates_metadata,
},
)
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=iso8601.parse_date(last_update),
extra_loader_arguments={
"artifacts": artifacts,
"crates_metadata": crates_metadata,
},
)
def finalize(self) -> None:
if not self.state.index_last_update and self.all_crates_processed:
last = iso8601.parse_date(self.index_metadata["timestamp"])
......
......@@ -175,7 +175,7 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler, requests_mock_datadir):
lister = CratesLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 3
assert res.pages == 1
assert res.origins == 3
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
......@@ -205,7 +205,7 @@ def test_crates_lister_incremental(
lister = CratesLister(scheduler=swh_scheduler)
first = lister.run()
assert first.pages == 3
assert first.pages == 1
assert first.origins == 3
second = lister.run()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment