From f1ae6825e56c0903b5a12f85f2c7ec980e81028d Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 1 Aug 2023 16:47:24 +0200
Subject: [PATCH] packagist: Improve extract package metadata information
 algorithm

The current lister implementation lists very few metadata with the hard-coded /p/ base
url (404 on mostly all packages). The packagist api implementation must have evolved
since the initial implementation of the lister (and the first deployment on staging).

Following the upstream documentation [1], it's sensible to first use the /p2/ as it's
performant from the packagist api side. It's then fallbacking to use /p2/+~dev url
scheme, then the /p/ scheme and finally the /packages/ base url if previous result are
either not found or empty (different than no modification since the last visit).

It keeps the initial implementation behavior of stopping immediately if a 304
NotModifiedSince is returned by the server.

[1] https://repo.packagist.org/apidoc
---
 swh/lister/packagist/lister.py            | 116 ++++++++++++++++++----
 swh/lister/packagist/tests/test_lister.py |  54 +++++++---
 2 files changed, 137 insertions(+), 33 deletions(-)

diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py
index af57b550..5f7a4a53 100644
--- a/swh/lister/packagist/lister.py
+++ b/swh/lister/packagist/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2022  The Software Heritage developers
+# Copyright (C) 2019-2023  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -21,6 +21,12 @@ logger = logging.getLogger(__name__)
 PackagistPageType = List[str]
 
 
+class NotModifiedSinceLastVisit(ValueError):
+    """Exception raised when a package has seen no change since the last visit."""
+
+    pass
+
+
 @dataclass
 class PackagistListerState:
     """State of Packagist lister"""
@@ -47,7 +53,18 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
 
     LISTER_NAME = "Packagist"
     PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json"
-    PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p"
+    PACKAGIST_PACKAGE_URL_FORMATS = [
+        # preferred, static, efficient on their side as it can be cached
+        "https://repo.packagist.org/p2/{package_name}.json",
+        # fallback from 1. ^ as it might contain development versions
+        "https://repo.packagist.org/p2/{package_name}~dev.json",
+        # composer v1 metadata, static, efficient but deprecated on their side.
+        # From previous lister version: very few results but some still exists so ok to
+        # check
+        "https://repo.packagist.org/p/{package_name}.json",
+        # dynamic, inefficient on packagist's end, should be used as a last resort
+        "https://repo.packagist.org/packages/{package_name}.json",
+    ]
 
     def __init__(
         self,
@@ -84,16 +101,85 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
             d["last_listing_date"] = last_listing_date.isoformat()
         return d
 
-    def api_request(self, url: str) -> Any:
+    def api_request(self, url: str) -> Dict:
+        """Execute api request to packagist server.
+
+        Raise:
+            NotModifiedSinceLastVisit: if the url returns a 304 response.
+
+        Return:
+            The json result in case of a 200, an empty response otherwise.
+
+        """
         response = self.http_request(url)
         # response is empty when status code is 304
-        return response.json() if response.status_code == 200 else {}
+        status_code = response.status_code
+        if status_code == 200:
+            return response.json()
+        elif status_code == 304:
+            raise NotModifiedSinceLastVisit(url)
+        else:
+            return {}
 
     def get_pages(self) -> Iterator[PackagistPageType]:
         """
         Yield a single page listing all Packagist projects.
         """
-        yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"]
+        yield self.api_request(self.url)["packageNames"]
+
+    def _get_metadata_from_page(
+        self, package_url_format: str, package_name: str
+    ) -> Optional[List[Dict]]:
+        """Retrieve metadata from a package if any.
+
+        Raise:
+            NotModifiedSinceLastVisit: if the url returns a 304 response.
+
+        Return:
+            metadata from a package if any
+
+        """
+        try:
+            metadata_url = package_url_format.format(package_name=package_name)
+            metadata = self.api_request(metadata_url)
+            packages = metadata.get("packages", {})
+            if not packages:
+                # package metadata not updated since last listing
+                return None
+            package_info = packages.get(package_name)
+            if package_info is None:
+                # missing package metadata in response
+                return None
+            return package_info.values()  # could be an empty response though -> []
+        except requests.HTTPError:
+            # error when getting package metadata (usually 404 when a package has
+            # been removed), skip it and process next package
+            return None
+
+    def _get_metadata_for_package(self, package_name: str) -> Optional[List[Dict]]:
+        """Tentatively retrieve metadata information from a package name.
+
+        This tries out in order the following pages:
+        - /p2/{package}.json: static and performant (on packagist's side) url.
+        - /p2/{package}~dev.json: static, performant for development package url.
+        - /packages/{package}.json: costly (for packagist's side) url
+
+        If nothing is found in all urls, a None result is returned.
+
+        Raise:
+            NotModifiedSinceLastVisit: if the url returns a 304 response.
+
+        Return:
+            Metadata information on the package name if any.
+
+        """
+        for package_url_format in self.PACKAGIST_PACKAGE_URL_FORMATS:
+            meta_info = self._get_metadata_from_page(package_url_format, package_name)
+            # If information, return it immediately, otherwise fallback to the next
+            if meta_info:
+                return meta_info
+
+        return None
 
     def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]:
         """
@@ -114,19 +200,13 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
 
         for package_name in page:
             try:
-                metadata = self.api_request(
-                    f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json"
-                )
-                if not metadata.get("packages", {}):
-                    # package metadata not updated since last listing
-                    continue
-                if package_name not in metadata["packages"]:
-                    # missing package metadata in response
-                    continue
-                versions_info = metadata["packages"][package_name].values()
-            except requests.HTTPError:
-                # error when getting package metadata (usually 404 when a
-                # package has been removed), skip it and process next package
+                versions_info = self._get_metadata_for_package(package_name)
+            except NotModifiedSinceLastVisit:
+                # Package was not modified server side since the last visit, we skip it
+                continue
+
+            if versions_info is None:
+                # No info on package, we skip it
                 continue
 
             origin_url = None
diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py
index 4f512a23..fa8cdfcf 100644
--- a/swh/lister/packagist/tests/test_lister.py
+++ b/swh/lister/packagist/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2022  The Software Heritage developers
+# Copyright (C) 2019-2023  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -43,8 +43,11 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
     for package_name in _packages_list["packageNames"]:
         metadata = _package_metadata(datadir, package_name)
         packages_metadata[package_name] = metadata
+        package_metadata_url = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
+            package_name=package_name
+        )
         requests_mock.get(
-            f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
+            package_metadata_url,
             json=metadata,
             additional_matcher=_request_without_if_modified_since,
         )
@@ -86,8 +89,11 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
     # has been updated since first listing
     lister = PackagistLister(scheduler=swh_scheduler)
     for package_name in _packages_list["packageNames"]:
+        package_metadata_url = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
+            package_name=package_name
+        )
         requests_mock.get(
-            f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
+            package_metadata_url,
             additional_matcher=_request_with_if_modified_since,
             status_code=304,
         )
@@ -110,11 +116,13 @@ def test_packagist_lister_missing_metadata(swh_scheduler, requests_mock, datadir
     lister = PackagistLister(scheduler=swh_scheduler)
     requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
     for package_name in _packages_list["packageNames"]:
-        requests_mock.get(
-            f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
-            additional_matcher=_request_without_if_modified_since,
-            status_code=404,
-        )
+        for format_url in lister.PACKAGIST_PACKAGE_URL_FORMATS:
+            url = format_url.format(package_name=package_name)
+            requests_mock.get(
+                url,
+                additional_matcher=_request_without_if_modified_since,
+                status_code=404,
+            )
 
     stats = lister.run()
 
@@ -126,11 +134,13 @@ def test_packagist_lister_empty_metadata(swh_scheduler, requests_mock, datadir):
     lister = PackagistLister(scheduler=swh_scheduler)
     requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
     for package_name in _packages_list["packageNames"]:
-        requests_mock.get(
-            f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
-            additional_matcher=_request_without_if_modified_since,
-            json={"packages": {}},
-        )
+        for format_url in lister.PACKAGIST_PACKAGE_URL_FORMATS:
+            url = format_url.format(package_name=package_name)
+            requests_mock.get(
+                url,
+                additional_matcher=_request_without_if_modified_since,
+                json={"packages": {}},
+            )
 
     stats = lister.run()
 
@@ -146,8 +156,19 @@ def test_packagist_lister_package_with_bitbucket_hg_origin(
     requests_mock.get(
         lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]}
     )
+    url_not_found = lister.PACKAGIST_PACKAGE_URL_FORMATS[0].format(
+        package_name=package_name
+    )
     requests_mock.get(
-        f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
+        url_not_found,
+        additional_matcher=_request_without_if_modified_since,
+        status_code=404,
+    )
+    url_with_results = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
+        package_name=package_name
+    )
+    requests_mock.get(
+        url_with_results,
         additional_matcher=_request_without_if_modified_since,
         json=_package_metadata(datadir, package_name),
     )
@@ -166,8 +187,11 @@ def test_packagist_lister_package_normalize_github_origin(
     requests_mock.get(
         lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]}
     )
+    url_with_results = lister.PACKAGIST_PACKAGE_URL_FORMATS[0].format(
+        package_name=package_name
+    )
     requests_mock.get(
-        f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
+        url_with_results,
         additional_matcher=_request_without_if_modified_since,
         json=_package_metadata(datadir, package_name),
     )
-- 
GitLab