Skip to content
Snippets Groups Projects
Commit b8de6922 authored by Antoine Lambert's avatar Antoine Lambert Committed by Phabricator Migration user
Browse files

gitlab: Improve incremental listing

Incremental listing of a GitLab instance will now list repositories
modified since last listing date, previously only repositories created
since last listing date were listed.

We still benefit from GitLab keyset pagination with that extra filtering
so it seems those type of queries are well indexed in GitLab database.

This should help reducing the lag between archived GitLab repositories
and their upstream states.
parent cee6bcb5
No related branches found
Tags debian/upstream/0.0.38
No related merge requests found
......@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import logging
import random
from typing import Any, Dict, Iterator, Optional, Tuple
......@@ -31,9 +32,8 @@ VCS_MAPPING = {"hg_git": "hg"}
class GitLabListerState:
"""State of the GitLabLister"""
last_seen_next_link: Optional[str] = None
"""Last link header (not visited yet) during an incremental pass
last_listing_date: Optional[str] = None
"""Last date when listing started during an incremental pass
"""
......@@ -117,6 +117,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
self.incremental = incremental
self.last_page: Optional[str] = None
self.per_page = 100
self.listing_date = datetime.now(timezone.utc).isoformat()
self.session = requests.Session()
self.session.headers.update(
......@@ -133,7 +134,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
self.session.headers["Authorization"] = f"Bearer {api_token}"
def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState:
return GitLabListerState(**d)
return GitLabListerState(last_listing_date=d.get("last_listing_date"))
def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]:
return asdict(state)
......@@ -188,14 +189,12 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
}
if id_after is not None:
parameters["id_after"] = str(id_after)
if self.incremental and self.state and self.state.last_listing_date:
parameters["last_activity_after"] = self.state.last_listing_date
return f"{self.url}/projects?{urlencode(parameters)}"
def get_pages(self) -> Iterator[PageResult]:
next_page: Optional[str]
if self.incremental and self.state and self.state.last_seen_next_link:
next_page = self.state.last_seen_next_link
else:
next_page = self.page_url()
next_page = self.page_url()
while next_page:
self.last_page = next_page
......@@ -217,49 +216,12 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
last_update=iso8601.parse_date(repo["last_activity_at"]),
)
def commit_page(self, page_result: PageResult) -> None:
"""Update currently stored state using the latest listed "next" page if relevant.
Relevancy is determined by the next_page link whose 'page' id must be strictly
superior to the currently stored one.
Note: this is a noop for full listing mode
"""
if self.incremental:
# link: https://${project-api}/?...&page=2x...
next_page = page_result.next_page
if not next_page and self.last_page:
next_page = self.last_page
if next_page:
id_after = _parse_id_after(next_page)
previous_next_page = self.state.last_seen_next_link
previous_id_after = _parse_id_after(previous_next_page)
if previous_next_page is None or (
previous_id_after and id_after and previous_id_after < id_after
):
self.state.last_seen_next_link = next_page
def finalize(self) -> None:
"""finalize the lister state when relevant (see `fn:commit_page` for details)
Note: this is a noop for full listing mode
"""
next_page = self.state.last_seen_next_link
if self.incremental and next_page:
# link: https://${project-api}/?...&page=2x...
next_id_after = _parse_id_after(next_page)
scheduler_state = self.get_state_from_scheduler()
previous_next_id_after = _parse_id_after(
scheduler_state.last_seen_next_link
)
if (not previous_next_id_after and next_id_after) or (
previous_next_id_after
and next_id_after
and previous_next_id_after < next_id_after
):
self.updated = True
if self.incremental:
self.state.last_listing_date = self.listing_date
self.updated = True
......@@ -6,7 +6,7 @@
"description": "Miscellaneous utils that are commonly used in multiple projects.",
"forks_count": 0,
"http_url_to_repo": "https://gite.lirmm.fr/yuquan/roboticsutils.git",
"id": 4456,
"id": 1,
"last_activity_at": "2021-01-14T11:32:50.672Z",
"name": "RoboticsUtils",
"name_with_namespace": "Wang Yuquan / RoboticsUtils",
......@@ -26,7 +26,7 @@
"description": "",
"forks_count": 0,
"http_url_to_repo": "https://gite.lirmm.fr/constraint-acquisition-team/pacq.git",
"id": 4444,
"id": 2,
"last_activity_at": "2020-12-15T19:43:53.678Z",
"name": "PACQ",
"name_with_namespace": "Constraint Acquisition Team / PACQ",
......@@ -39,4 +39,4 @@
"tag_list": [],
"web_url": "https://gite.lirmm.fr/constraint-acquisition-team/pacq"
}
]
]
\ No newline at end of file
......@@ -6,7 +6,7 @@
"description": "",
"forks_count": 0,
"http_url_to_repo": "https://gite.lirmm.fr/mgardeisen/citest.git",
"id": 4440,
"id": 3,
"last_activity_at": "2021-01-21T14:37:31.022Z",
"name": "CItest",
"name_with_namespace": "Marine Gardeisen / CItest",
......@@ -26,7 +26,7 @@
"description": "Can be used to enforce the conventional commits specification on a package, generate a changelog recommend the next version to release\r\nSee see https://conventionalcommits.org",
"forks_count": 0,
"http_url_to_repo": "https://gite.lirmm.fr/pid/environments/conventional_commits.git",
"id": 4428,
"id": 4,
"last_activity_at": "2021-01-08T11:11:54.178Z",
"name": "conventional_commits",
"name_with_namespace": "pid / environments / conventional_commits",
......
......@@ -6,7 +6,7 @@
"description": "PID WRapper for urdfdom and urdfdom_headers project, provided by ROS but independent from ROS.",
"forks_count": 0,
"http_url_to_repo": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom.git",
"id": 4363,
"id": 5,
"last_activity_at": "2020-11-19T08:56:18.573Z",
"name": "urdfdom",
"name_with_namespace": "rob-miscellaneous / wrappers / urdfdom",
......@@ -19,4 +19,4 @@
"tag_list": [],
"web_url": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom"
}
]
]
\ No newline at end of file
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -7,6 +7,7 @@ import json
import logging
from pathlib import Path
from typing import Dict, List
from urllib.parse import quote
import pytest
from requests.status_codes import codes
......@@ -92,9 +93,9 @@ def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock):
assert listed_origin.last_update is not None
def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]:
def gitlab_page_response(datadir, instance: str, page_num: int) -> List[Dict]:
"""Return list of repositories (out of test dataset)"""
datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json")
datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_num}.json")
return json.loads(datapath.read_text()) if datapath.exists else []
......@@ -142,9 +143,9 @@ def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
url_page1 = lister.page_url()
response1 = gitlab_page_response(datadir, instance, 1)
url_page2 = lister.page_url(2)
url_page2 = lister.page_url(response1[-1]["id"])
response2 = gitlab_page_response(datadir, instance, 2)
url_page3 = lister.page_url(3)
url_page3 = lister.page_url(response2[-1]["id"])
response3 = gitlab_page_response(datadir, instance, 3)
requests_mock.get(
......@@ -162,11 +163,25 @@ def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
expected_nb_origins = len(response1) + len(response2)
assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)
assert lister.state.last_seen_next_link == url_page2
assert lister.state.last_listing_date == lister.listing_date
lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True)
# Lister will start back at the last stop
url_page2 = lister2.page_url()
response2 = gitlab_page_response(datadir, instance, 2)
url_page3 = lister2.page_url(response2[-1]["id"])
response3 = gitlab_page_response(datadir, instance, 3)
# in a real world scenario, incremental lister will list repositories whose
# have been modified since last listing date
last_activity_param = (
f"last_activity_after={quote(lister2.state.last_listing_date)}"
)
assert last_activity_param in url_page2
assert last_activity_param in url_page3
requests_mock.get(
url_page2,
[{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}],
......@@ -183,7 +198,7 @@ def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
assert listed_result2 == ListerStats(
pages=2, origins=len(response2) + len(response3)
)
assert lister2.state.last_seen_next_link == url_page3
assert lister2.state.last_listing_date == lister2.listing_date
assert lister.lister_obj.id == lister2.lister_obj.id
scheduler_origins = lister2.scheduler.get_listed_origins(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment