Skip to content
Snippets Groups Projects
Verified Commit f1ae6825 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

packagist: Improve extract package metadata information algorithm

The current lister implementation lists very few metadata with the hard-coded /p/ base
url (404 on mostly all packages). The packagist api implementation must have evolved
since the initial implementation of the lister (and the first deployment on staging).

Following the upstream documentation [1], it's sensible to first use the /p2/ as it's
performant from the packagist api side. It's then fallbacking to use /p2/+~dev url
scheme, then the /p/ scheme and finally the /packages/ base url if previous result are
either not found or empty (different than no modification since the last visit).

It keeps the initial implementation behavior of stopping immediately if a 304
NotModifiedSince is returned by the server.

[1] https://repo.packagist.org/apidoc
parent 1f272506
No related branches found
Tags v5.9.3
1 merge request!494packagist: Improve extract package metadata information algorithm
Pipeline #3713 passed
# Copyright (C) 2019-2022 The Software Heritage developers
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -21,6 +21,12 @@ logger = logging.getLogger(__name__)
PackagistPageType = List[str]
class NotModifiedSinceLastVisit(ValueError):
"""Exception raised when a package has seen no change since the last visit."""
pass
@dataclass
class PackagistListerState:
"""State of Packagist lister"""
......@@ -47,7 +53,18 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
LISTER_NAME = "Packagist"
PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json"
PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p"
PACKAGIST_PACKAGE_URL_FORMATS = [
# preferred, static, efficient on their side as it can be cached
"https://repo.packagist.org/p2/{package_name}.json",
# fallback from 1. ^ as it might contain development versions
"https://repo.packagist.org/p2/{package_name}~dev.json",
# composer v1 metadata, static, efficient but deprecated on their side.
# From previous lister version: very few results but some still exists so ok to
# check
"https://repo.packagist.org/p/{package_name}.json",
# dynamic, inefficient on packagist's end, should be used as a last resort
"https://repo.packagist.org/packages/{package_name}.json",
]
def __init__(
self,
......@@ -84,16 +101,85 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
d["last_listing_date"] = last_listing_date.isoformat()
return d
def api_request(self, url: str) -> Any:
def api_request(self, url: str) -> Dict:
"""Execute api request to packagist server.
Raise:
NotModifiedSinceLastVisit: if the url returns a 304 response.
Return:
The json result in case of a 200, an empty response otherwise.
"""
response = self.http_request(url)
# response is empty when status code is 304
return response.json() if response.status_code == 200 else {}
status_code = response.status_code
if status_code == 200:
return response.json()
elif status_code == 304:
raise NotModifiedSinceLastVisit(url)
else:
return {}
def get_pages(self) -> Iterator[PackagistPageType]:
"""
Yield a single page listing all Packagist projects.
"""
yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"]
yield self.api_request(self.url)["packageNames"]
def _get_metadata_from_page(
self, package_url_format: str, package_name: str
) -> Optional[List[Dict]]:
"""Retrieve metadata from a package if any.
Raise:
NotModifiedSinceLastVisit: if the url returns a 304 response.
Return:
metadata from a package if any
"""
try:
metadata_url = package_url_format.format(package_name=package_name)
metadata = self.api_request(metadata_url)
packages = metadata.get("packages", {})
if not packages:
# package metadata not updated since last listing
return None
package_info = packages.get(package_name)
if package_info is None:
# missing package metadata in response
return None
return package_info.values() # could be an empty response though -> []
except requests.HTTPError:
# error when getting package metadata (usually 404 when a package has
# been removed), skip it and process next package
return None
def _get_metadata_for_package(self, package_name: str) -> Optional[List[Dict]]:
"""Tentatively retrieve metadata information from a package name.
This tries out in order the following pages:
- /p2/{package}.json: static and performant (on packagist's side) url.
- /p2/{package}~dev.json: static, performant for development package url.
- /packages/{package}.json: costly (for packagist's side) url
If nothing is found in all urls, a None result is returned.
Raise:
NotModifiedSinceLastVisit: if the url returns a 304 response.
Return:
Metadata information on the package name if any.
"""
for package_url_format in self.PACKAGIST_PACKAGE_URL_FORMATS:
meta_info = self._get_metadata_from_page(package_url_format, package_name)
# If information, return it immediately, otherwise fallback to the next
if meta_info:
return meta_info
return None
def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]:
"""
......@@ -114,19 +200,13 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
for package_name in page:
try:
metadata = self.api_request(
f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json"
)
if not metadata.get("packages", {}):
# package metadata not updated since last listing
continue
if package_name not in metadata["packages"]:
# missing package metadata in response
continue
versions_info = metadata["packages"][package_name].values()
except requests.HTTPError:
# error when getting package metadata (usually 404 when a
# package has been removed), skip it and process next package
versions_info = self._get_metadata_for_package(package_name)
except NotModifiedSinceLastVisit:
# Package was not modified server side since the last visit, we skip it
continue
if versions_info is None:
# No info on package, we skip it
continue
origin_url = None
......
# Copyright (C) 2019-2022 The Software Heritage developers
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -43,8 +43,11 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
for package_name in _packages_list["packageNames"]:
metadata = _package_metadata(datadir, package_name)
packages_metadata[package_name] = metadata
package_metadata_url = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
package_name=package_name
)
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
package_metadata_url,
json=metadata,
additional_matcher=_request_without_if_modified_since,
)
......@@ -86,8 +89,11 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
# has been updated since first listing
lister = PackagistLister(scheduler=swh_scheduler)
for package_name in _packages_list["packageNames"]:
package_metadata_url = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
package_name=package_name
)
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
package_metadata_url,
additional_matcher=_request_with_if_modified_since,
status_code=304,
)
......@@ -110,11 +116,13 @@ def test_packagist_lister_missing_metadata(swh_scheduler, requests_mock, datadir
lister = PackagistLister(scheduler=swh_scheduler)
requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
for package_name in _packages_list["packageNames"]:
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
additional_matcher=_request_without_if_modified_since,
status_code=404,
)
for format_url in lister.PACKAGIST_PACKAGE_URL_FORMATS:
url = format_url.format(package_name=package_name)
requests_mock.get(
url,
additional_matcher=_request_without_if_modified_since,
status_code=404,
)
stats = lister.run()
......@@ -126,11 +134,13 @@ def test_packagist_lister_empty_metadata(swh_scheduler, requests_mock, datadir):
lister = PackagistLister(scheduler=swh_scheduler)
requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
for package_name in _packages_list["packageNames"]:
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
additional_matcher=_request_without_if_modified_since,
json={"packages": {}},
)
for format_url in lister.PACKAGIST_PACKAGE_URL_FORMATS:
url = format_url.format(package_name=package_name)
requests_mock.get(
url,
additional_matcher=_request_without_if_modified_since,
json={"packages": {}},
)
stats = lister.run()
......@@ -146,8 +156,19 @@ def test_packagist_lister_package_with_bitbucket_hg_origin(
requests_mock.get(
lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]}
)
url_not_found = lister.PACKAGIST_PACKAGE_URL_FORMATS[0].format(
package_name=package_name
)
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
url_not_found,
additional_matcher=_request_without_if_modified_since,
status_code=404,
)
url_with_results = lister.PACKAGIST_PACKAGE_URL_FORMATS[1].format(
package_name=package_name
)
requests_mock.get(
url_with_results,
additional_matcher=_request_without_if_modified_since,
json=_package_metadata(datadir, package_name),
)
......@@ -166,8 +187,11 @@ def test_packagist_lister_package_normalize_github_origin(
requests_mock.get(
lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]}
)
url_with_results = lister.PACKAGIST_PACKAGE_URL_FORMATS[0].format(
package_name=package_name
)
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
url_with_results,
additional_matcher=_request_without_if_modified_since,
json=_package_metadata(datadir, package_name),
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment