Skip to content
Snippets Groups Projects
Verified Commit 3a65fbb4 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

swh.lister.pypi: Use pypi's legacy html based api to list packages

The XML-RPC API is marked as deprecated [1]. Even though it still works
for now, the legacy API is not marked as deprecated, so moving to that
one sounds more reasonable [2].

[1] https://warehouse.readthedocs.io/api-reference/xml-rpc/#pypi-s-xml-rpc-methods

[2] https://warehouse.readthedocs.io/api-reference/legacy/#simple-project-api

Related T422
parent 6ff3b908
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ import random
from datetime import datetime
from email.utils import parsedate
from pprint import pformat
from xmlrpc import client
import requests
import xmltodict
......@@ -35,15 +36,8 @@ class ListerXMLRPCTransport(abc.ABC):
"""Initialize client to query for result
"""
from xmlrpc import client
return client.ServerProxy(path)
def list_packages(self, client):
"""Listing method
"""
pass
def request_uri(self, _):
"""Same uri called once
......@@ -64,24 +58,22 @@ class ListerXMLRPCTransport(abc.ABC):
return False, 0
def transport_request(self, identifier):
"""Implements SWHListerBase.transport_request for HTTP using Requests.
"""Implements SWHListerBase.transport_request
"""
path = self.request_uri(identifier)
# params = self.request_params(identifier) # we cannot use this...
try:
_client = self.get_client(path)
return self.list_packages(_client)
return self.get_client(path)
except Exception as e:
raise FetchError(e)
def transport_response_to_string(self, response):
"""Implements SWHListerBase.transport_response_to_string for XMLRPC
given responses.
"""
s = pformat(self.SERVER)
s += '\n#\n' + pformat(response)
s += '\n#\n' + pformat(response) # Note: will potentially be big
return s
......@@ -216,3 +208,25 @@ class SWHListerHttpTransport(abc.ABC):
except Exception: # not xml
s += pformat(response.text)
return s
class ListerOnePageApiTransport(SWHListerHttpTransport):
    """Transport for a server api exposed as one single page (``PAGE``).

    Relies on the requests library: a session is created on construction,
    and ``request_uri`` always answers with ``PAGE`` whatever identifier
    it is given.

    To be used in conjunction with SWHListerBase or a subclass of it.

    """
    # Must be provided by the concrete lister.
    PAGE = AbstractAttribute(
        "The server api's unique page to retrieve and parse for information")
    # Explicitly unset: this transport does not use it.
    PATH_TEMPLATE = None

    def __init__(self, api_baseurl=None):
        """Set up the lister version and the HTTP session.

        Args:
            api_baseurl: accepted but never read by this transport
                (presumably kept for signature compatibility with
                SWHListerHttpTransport -- to be confirmed).

        """
        self.lister_version = __version__
        self.session = requests.Session()

    def request_uri(self, _):
        """Return the uri to request, which is always ``PAGE``.

        The transport_request identifier received as argument is ignored.

        """
        return self.PAGE
......@@ -2,6 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import logging
from .lister_base import SWHListerBase
......@@ -17,6 +18,13 @@ class SimpleLister(SWHListerBase):
information and stores those in db
"""
@abc.abstractmethod
def list_packages(self, *args):
    """List the packages (abstract hook, to be overridden).

    This default implementation does nothing and returns None.
    ``ingest_data`` calls it with the response obtained from
    ``safely_issue_request`` and uses its result as the response to
    process further.

    """
def ingest_data(self, identifier, checks=False):
"""Rework the base ingest_data.
Request server endpoint which gives all in one go.
......@@ -32,6 +40,7 @@ class SimpleLister(SWHListerBase):
"""
# Request (partial?) list of repositories info
response = self.safely_issue_request(identifier)
response = self.list_packages(response)
if not response:
return response, []
models_list = self.transport_response_simplified(response)
......
......@@ -2,21 +2,23 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import xmltodict
from .models import PyPiModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerXMLRPCTransport
from swh.lister.core.lister_transports import ListerOnePageApiTransport
class PyPiLister(ListerXMLRPCTransport, SimpleLister):
class PyPiLister(ListerOnePageApiTransport, SimpleLister):
# Template path expecting an integer that represents the page id
MODEL = PyPiModel
LISTER_NAME = 'pypi'
SERVER = 'https://pypi.org/pypi'
PAGE = 'https://pypi.org/simple/'
def __init__(self, override_config=None):
ListerXMLRPCTransport.__init__(self)
ListerOnePageApiTransport .__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def task_dict(self, origin_type, origin_url, **kwargs):
......@@ -33,11 +35,13 @@ class PyPiLister(ListerXMLRPCTransport, SimpleLister):
_type, _policy, origin_url,
project_metadata_url=project_metadata_url)
def list_packages(self, client):
"""(Override) List the actual pypi origins from the api.
def list_packages(self, response):
"""(Override) List the actual pypi origins from the response.
"""
return client.list_packages()
result = xmltodict.parse(response.content)
_all = result['html']['body']['a']
return [package['#text'] for package in _all]
def _compute_urls(self, repo_name):
"""Returns a tuple (project_url, project_metadata_url)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment