Skip to content
Snippets Groups Projects
Verified Commit 847a8d34 authored by Antoine R. Dumont
Browse files

swh.lister.gitlab: Add Incremental lister behavior

Related T989
parent ccd0525c
No related merge requests found
......@@ -79,7 +79,14 @@ class PageByPageLister(SWHListerBase):
# You probably don't need to override anything below this line.
def run(self, min_bound=None, max_bound=None):
def check_existence(self, injected_repos):
    """Tell whether the given injected repos are already known to us.

    Args:
        injected_repos: list of repos that were just injected

    Returns:
        False in every case for now, since the actual check is not
        implemented yet.

    """
    # FIXME: Implement the check
    already_known = False
    return already_known
def run(self, min_bound=None, max_bound=None, check_existence=False):
"""Main entry function. Sequentially fetches repository data from the
service according to the basic outline in the class
docstring. Continually fetching sublists until either there
......@@ -89,6 +96,9 @@ class PageByPageLister(SWHListerBase):
Args:
min_bound: optional page to start from
max_bound: optional page to stop at
check_existence (bool): optional existence check (for
incremental lister whose sort
order is inverted)
Returns:
nothing
......@@ -99,6 +109,7 @@ class PageByPageLister(SWHListerBase):
self.min_page = min_bound
self.max_page = max_bound
already_seen = False
while self.is_within_bounds(page, self.min_page, self.max_page):
logging.info('listing repos starting at %s' % page)
......@@ -106,12 +117,18 @@ class PageByPageLister(SWHListerBase):
response, injected_repos = self.ingest_data(page)
next_page = self.get_next_target_from_response(response)
if check_existence:
already_seen = self.check_existence(injected_repos)
# termination condition
if (next_page is None) or (next_page == page):
logging.info('stopping after page %s, no next link found' %
page)
break
elif already_seen:
logging.info('Repositories already seen, stopping')
break
else:
page = next_page
......
......@@ -12,7 +12,7 @@ from .models import GitLabModel
class GitLabLister(PageByPageHttpLister):
# Template path expecting an integer that represents the page id
PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
PATH_TEMPLATE = '/projects?page=%d&order_by=id'
API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*')
MODEL = GitLabModel
LISTER_NAME = 'gitlab'
......@@ -103,7 +103,7 @@ class GitLabLister(PageByPageHttpLister):
return None
def get_pages_information(self):
"""Determine some pages information.
"""Determine pages information.
"""
response = self.transport_head(identifier=1)
......
......@@ -17,13 +17,16 @@ class GitLabListerTask(ListerTaskBase):
class RangeGitLabLister(GitLabListerTask, RangeListerTask):
"""GitLab lister working on specified range (start, end) arguments.
"""Range GitLab lister (list available origins on specified range)
"""
task_queue = 'swh_lister_gitlab_refresh'
class FullGitLabRelister(GitLabListerTask):
"""Full GitLab lister (list all available origins from the api).
"""
task_queue = 'swh_lister_gitlab_refresh'
def run_task(self, *args, **kwargs):
......@@ -41,3 +44,22 @@ class FullGitLabRelister(GitLabListerTask):
range_task = RangeGitLabLister()
group(range_task.s(minv, maxv, *args, **kwargs)
for minv, maxv in ranges)()
class IncrementalGitLabLister(ListerTaskBase):
    """Incremental GitLab lister task: lists only new available origins.

    The lister is created with a descending sort order and run with the
    existence check enabled.

    """
    task_queue = 'swh_lister_gitlab_discover'

    def new_lister(self, api_baseurl='https://gitlab.com/api/v4',
                   instance='gitlab.com'):
        """Build the GitLab lister used by this task.

        Args:
            api_baseurl: base url of the GitLab api to query
            instance: name of the GitLab instance

        Returns:
            a GitLabLister created with sort='desc'

        """
        lister_options = {
            'instance': instance,
            'api_baseurl': api_baseurl,
            # inverts the order of the lister's result
            'sort': 'desc',
        }
        return GitLabLister(**lister_options)

    def run_task(self, *args, **kwargs):
        """Create the lister from the task arguments and run it.

        No page bounds are given; check_existence=True asks the lister
        to check for existing data and exit when found.

        """
        gitlab_lister = self.new_lister(*args, **kwargs)
        return gitlab_lister.run(
            min_bound=None, max_bound=None, check_existence=True)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment