From 5d0f35aa690d67d73a13fb5c6e57f2f39ac82524 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Thu, 9 Mar 2023 14:26:29 +0100 Subject: [PATCH] bitbucket: Skip buggy page when listing Some URLs of the repositories endpoint from Bitbucket REST API 2.0 can return an error 500. In that case, skip the buggy repositories page and get the next one to continue listing and avoid ending it prematurely. Related to #4239 --- swh/lister/bitbucket/lister.py | 22 +++++++++-- .../tests/data/bb_api_repositories_page1.json | 2 +- swh/lister/bitbucket/tests/test_lister.py | 37 ++++++++++++++++++- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 05720c94..d65d0c26 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,6 +11,7 @@ from typing import Any, Dict, Iterator, List, Optional from urllib import parse import iso8601 +from requests import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -118,9 +119,22 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): while True: self.url_params["after"] = last_repo_cdate - body = self.http_request(self.url, params=self.url_params).json() - - yield body["values"] + try: + body = self.http_request(self.url, params=self.url_params).json() + yield body["values"] + except HTTPError as e: + if e.response.status_code == 500: + logger.warning( + "URL %s is buggy (error 500), skip it and get next page.", + e.response.url, + ) + body = self.http_request( + self.url, + params={ + "pagelen": 
self.url_params["pagelen"], + "fields": "next", + }, + ).json() next_page_url = body.get("next") if next_page_url is not None: diff --git a/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json b/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json index 8c9b59df..005bd5a5 100644 --- a/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json +++ b/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json @@ -161,5 +161,5 @@ } } ], - "next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00&fields=next%2Cvalues.links.clone.href%2Cvalues.slug%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on" + "next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&fields=next%2Cvalues.links.clone.href%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00" } \ No newline at end of file diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py index 04df3240..7ca2b0bc 100644 --- a/swh/lister/bitbucket/tests/test_lister.py +++ b/swh/lister/bitbucket/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,6 +10,7 @@ import os import pytest from swh.lister.bitbucket.lister import BitbucketLister +from swh.lister.utils import MAX_NUMBER_ATTEMPTS @pytest.fixture @@ -178,3 +179,37 @@ def test_bitbucket_full_lister( ) _check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins) + + +def test_bitbucket_lister_buggy_page( + swh_scheduler, + requests_mock, + mocker, + bb_api_repositories_page1, + bb_api_repositories_page2, +): + + requests_mock.get( + BitbucketLister.API_URL, + [ + {"json": 
bb_api_repositories_page1, "status_code": 200}, + *[{"json": None, "status_code": 500}] * MAX_NUMBER_ATTEMPTS, + {"json": {"next": bb_api_repositories_page1["next"]}, "status_code": 200}, + {"json": bb_api_repositories_page2, "status_code": 200}, + ], + ) + + lister = BitbucketLister(scheduler=swh_scheduler, page_size=10) + + mocker.patch.object(lister.http_request.retry, "sleep") + + stats = lister.run() + + assert stats.pages == 2 + assert stats.origins == 20 + assert len(swh_scheduler.get_listed_origins(lister.lister_obj.id).results) == 20 + + assert ( + requests_mock.request_history[MAX_NUMBER_ATTEMPTS + 2].url + == bb_api_repositories_page1["next"] + ) -- GitLab