diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
index 05720c94143cf58e8b3d3b95070de2d1cfedc418..d65d0c26c3b1e531968f729aac349d683c62629a 100644
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 The Software Heritage developers
+# Copyright (C) 2017-2023 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib import parse
 
 import iso8601
+from requests import HTTPError
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -118,9 +119,33 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
 
         while True:
             self.url_params["after"] = last_repo_cdate
-            body = self.http_request(self.url, params=self.url_params).json()
-
-            yield body["values"]
+            try:
+                body = self.http_request(self.url, params=self.url_params).json()
+            except HTTPError as e:
+                # Only a 500 answer is treated as the known "buggy page" case.
+                # Any other HTTP error is re-raised: swallowing it would leave
+                # `body` unbound (first page) or stale (later pages) for the
+                # pagination code below.
+                if e.response is None or e.response.status_code != 500:
+                    raise
+                logger.warning(
+                    "URL %s is buggy (error 500), skip it and get next page.",
+                    e.response.url,
+                )
+                # Request only the "next" field, without the repository fields,
+                # to find where to resume; nothing is yielded for this page.
+                # NOTE(review): this request carries no "after" parameter;
+                # confirm the API still returns the link following the
+                # failing page rather than the one following the first page.
+                body = self.http_request(
+                    self.url,
+                    params={
+                        "pagelen": self.url_params["pagelen"],
+                        "fields": "next",
+                    },
+                ).json()
+            else:
+                yield body["values"]
 
             next_page_url = body.get("next")
             if next_page_url is not None:
diff --git a/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json b/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json
index 8c9b59dfb40964aa425f29cac0f30413a25b1c45..005bd5a5ad132c57cb3e8a8600e03128a1b8a810 100644
--- a/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json
+++ b/swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json
@@ -161,5 +161,5 @@
             }
         }
     ],
-    "next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00&fields=next%2Cvalues.links.clone.href%2Cvalues.slug%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on"
+    "next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&fields=next%2Cvalues.links.clone.href%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00"
 }
\ No newline at end of file
diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py
index 04df324033e366d7ffd11c5dc65c45ca9c6ec64d..7ca2b0bc9bfd0084de40aaa797fd9809db862b35 100644
--- a/swh/lister/bitbucket/tests/test_lister.py
+++ b/swh/lister/bitbucket/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 The Software Heritage developers
+# Copyright (C) 2017-2023 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -10,6 +10,7 @@ import os
 import pytest
 
 from swh.lister.bitbucket.lister import BitbucketLister
+from swh.lister.utils import MAX_NUMBER_ATTEMPTS
 
 
 @pytest.fixture
@@ -178,3 +179,43 @@ def test_bitbucket_full_lister(
     )
 
     _check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins)
+
+
+def test_bitbucket_lister_buggy_page(
+    swh_scheduler,
+    requests_mock,
+    mocker,
+    bb_api_repositories_page1,
+    bb_api_repositories_page2,
+):
+    """A page that keeps answering 500 is skipped and listing resumes after it."""
+
+    # Mocked answers, in order: first page, then MAX_NUMBER_ATTEMPTS errors 500,
+    # then a payload holding only a "next" link, then the second page.
+    requests_mock.get(
+        BitbucketLister.API_URL,
+        [
+            {"json": bb_api_repositories_page1, "status_code": 200},
+            *[{"json": None, "status_code": 500}] * MAX_NUMBER_ATTEMPTS,
+            {"json": {"next": bb_api_repositories_page1["next"]}, "status_code": 200},
+            {"json": bb_api_repositories_page2, "status_code": 200},
+        ],
+    )
+
+    lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)
+
+    # Presumably keeps the retried requests from actually waiting.
+    mocker.patch.object(lister.http_request.retry, "sleep")
+
+    stats = lister.run()
+
+    assert stats.pages == 2
+    assert stats.origins == 20
+    assert len(swh_scheduler.get_listed_origins(lister.lister_obj.id).results) == 20
+
+    # The request issued right after the "next"-only one targets the URL found
+    # in the first page's "next" link.
+    assert (
+        requests_mock.request_history[MAX_NUMBER_ATTEMPTS + 2].url
+        == bb_api_repositories_page1["next"]
+    )