# Copyright (C) 2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import arrow
import gzip
import hashlib
import json
import logging
import os
import requests

from swh.core import tarball
from swh.model import hashutil

try:
    from swh.loader.pypi._version import __version__
except ImportError:
    __version__ = 'devel'


def convert_to_hex(d):
    """Convert a flat dictionary with bytes in values to the same dictionary
    with hex strings as values.

    Args:
        d (dict): flat dictionary with sha bytes in their values.

    Returns:
        Mirror dictionary with values as string hex.

    """
    if not d:
        return d
    return {
        key: hashutil.hash_to_hex(h) if isinstance(h, bytes) else h
        for key, h in d.items()
    }


class PyPiClient:
    """PyPi client in charge of communicating with the PyPI server.

    Args:
        temp_directory (str): existing directory to use as working
            space; a temporary one is created when not provided
        cache (bool): whether to save raw server responses on disk
        cache_dir (str): where to store cached responses (only used
            when `cache` is True)

    """
    def __init__(self, temp_directory=None, cache=False, cache_dir=None):
        self.version = __version__
        if not temp_directory:
            from tempfile import mkdtemp
            # no base directory given: let mkdtemp use the system
            # default (the original passed the falsy value as `dir`)
            self.temp_directory = mkdtemp(prefix='swh.loader.pypi.client')
        else:
            self.temp_directory = temp_directory

        self.do_cache = cache
        if self.do_cache:
            self.cache_dir = cache_dir
            os.makedirs(self.cache_dir, exist_ok=True)
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage PyPi Loader (%s)' % (
                    __version__
                )
            }
        }

    def _save_response(self, response):
        """Dump the server's json response to a gzipped file in
        `self.cache_dir`, named after the current UTC timestamp.

        Args:
            response: full server response

        """
        datepath = arrow.utcnow().isoformat()
        fname = os.path.join(self.cache_dir, datepath + '.gz')
        with gzip.open(fname, 'w') as f:
            f.write(bytes(json.dumps(response.json()), 'utf-8'))

    def _get(self, url):
        """Get query to the url.

        Args:
            url (str): url to request

        Returns:
            The json response.

        Raises:
            ValueError when the server does not answer 200 OK.

        """
        response = self.session.get(url, **self.params)
        if response.status_code != 200:
            # bug fix: the message referenced self.origin_url, an
            # attribute this class never defines, so any http error
            # surfaced as an AttributeError instead
            raise ValueError('Error during query request %s' % url)

        if self.do_cache:
            self._save_response(response)

        return response.json()

    def info(self, project_url):
        """Given a metadata project url, retrieve the raw json response

        """
        return self._get(project_url)

    def release(self, project, release):
        """Given a project and a release name, retrieve the raw json response

        """
        release_url = 'https://pypi.org/pypi/%s/%s/json' % (project, release)
        return self._get(release_url)

    def fetch_release(self, project, release):
        """Fetch one release archive locally: download it, check its
        size and sha256 digest against the metadata, uncompress it and
        enrich the release dict with local paths and checksums.

        Args:
            project (str): project name
            release (dict): flattened release metadata; must provide
                the 'name', 'filename', 'url', 'size', 'sha256' keys

        Returns:
            The input release dict, updated in place with 'directory'
            (uncompress path), 'archive_type' and checksum keys.

        Raises:
            ValueError when the download fails or the size or digest
            does not match the advertised metadata.

        """
        version = release['name']
        logging.debug('Release version: %s' % version)
        path = os.path.join(self.temp_directory, project, version)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, release['filename'])
        logging.debug('Release local path: %s' % filepath)

        r = self.session.get(release['url'], **self.params)
        if not r.ok:
            raise ValueError('Fail to retrieve release %s' % version)

        # check the advertised size before anything else
        _len = len(r.content)
        if _len != release['size']:
            raise ValueError('Error when checking size: %s != %s' % (
                release['size'], _len))

        # stream to disk while computing the sha256 digest
        h = hashlib.sha256()
        with open(filepath, 'wb') as f:
            for chunk in r.iter_content():
                h.update(chunk)
                f.write(chunk)

        actual_digest = h.hexdigest()
        if actual_digest != release['sha256']:
            raise ValueError(
                'Error when checking the hash checksum: %s != %s' % (
                    release['sha256'], actual_digest))

        uncompress_path = os.path.join(path, 'uncompress')
        os.makedirs(uncompress_path, exist_ok=True)

        nature = tarball.uncompress(filepath, uncompress_path)
        release['directory'] = uncompress_path

        artifact = convert_to_hex(hashutil.hash_path(filepath))
        artifact['archive_type'] = nature
        for key, value in artifact.items():
            release[key] = value

        return release

    def retrieve_releases(self, project, releases):
        """Given a dictionary of releases, retrieve them locally.

        Args:
            project (str): project name
            releases (dict): mapping of version to flattened release
                metadata (cf. `fetch_release`)

        Yields:
            (version, release) tuples in increasing version order,
            each release enriched as per `fetch_release`; the input
            dicts are left untouched.

        """
        # NOTE: plain string sort, so e.g. '0.10' sorts before '0.2'
        for version in sorted(releases):
            # delegate to fetch_release instead of duplicating the
            # whole download/verify/uncompress body; work on a copy so
            # the caller's dict is not mutated
            yield version, self.fetch_release(project,
                                              releases[version].copy())
0824234cd603f5b9389ba546d481387ab87905ff..ba97c4761ae216aebb6e3a5bbf4d6da34e393dca 100644 --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -4,169 +4,25 @@ # See top-level LICENSE file for more information import arrow -import hashlib import logging import os -import requests import shutil -from swh.core import tarball from swh.loader.core.utils import clean_dangling_folders from swh.loader.core.loader import SWHStatelessLoader -from swh.model import hashutil from swh.model.from_disk import Directory from swh.model.identifiers import ( release_identifier, revision_identifier, snapshot_identifier, identifier_to_bytes, normalize_timestamp ) +from .client import PyPiClient from .model import PyPiProject -try: - from swh.loader.pypi._version import __version__ -except ImportError: - __version__ = 'devel' - TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' -def convert_to_hex(d): - """Convert a flat dictionary with bytes in values to the same dictionary - with hex as values. - - Args: - dict: flat dictionary with sha bytes in their values. - - Returns: - Mirror dictionary with values as string hex. - - """ - if not d: - return d - - checksums = {} - for key, h in d.items(): - if isinstance(h, bytes): - checksums[key] = hashutil.hash_to_hex(h) - else: - checksums[key] = h - - return checksums - - -class PyPiClient: - """PyPi client in charge of discussing with the pypi server. 
- - """ - def __init__(self, temp_directory=None, cache=False, cache_dir=None): - self.version = __version__ - if not temp_directory: - from tempfile import mkdtemp - self.temp_directory = mkdtemp(dir=temp_directory, - prefix='swh.loader.pypi.client') - else: - self.temp_directory = temp_directory - - self.do_cache = cache - if self.do_cache: - self.cache_dir = cache_dir - os.makedirs(self.cache_dir, exist_ok=True) - self.session = requests.session() - self.params = { - 'headers': { - 'User-Agent': 'Software Heritage PyPi Loader (%s)' % ( - __version__ - ) - } - } - - def _save_response(self, response): - """Log the response from a server request to a cache dir. - - Args: - response: full server response - cache_dir: system path for cache dir - Returns: - nothing - """ - import gzip - from json import dumps - datepath = arrow.utcnow().isoformat() - fname = os.path.join(self.cache_dir, datepath + '.gz') - with gzip.open(fname, 'w') as f: - f.write(bytes( - dumps(response.json()), - 'UTF-8' - )) - - def info(self, project_url): - """Given a metadata project url, retrieve the raw json response - - """ - response = self.session.get(project_url, **self.params) - if response.status_code != 200: - raise ValueError('Fail to load origin %s' % self.origin_url) - - if self.do_cache: - self._save_response(response) - - return response.json() - - def retrieve_releases(self, project, releases): - """Given a dictionary of releases, retrieve them locally. 
- - """ - # order the release in time order - _release_versions = list(releases.keys()) - _release_versions.sort() - - for version in _release_versions: - release = releases[version] - _release = release.copy() - logging.debug('Release version: %s' % version) - path = os.path.join(self.temp_directory, project, version) - os.makedirs(path, exist_ok=True) - filepath = os.path.join(path, release['filename']) - logging.debug('Release local path: %s' % filepath) - - r = self.session.get(release['url']) - if not r.ok: - raise ValueError('Fail to retrieve release %s' % version) - - # checks - _len = len(r.content) - if _len != release['size']: - raise ValueError('Error when checking size: %s != %s' % ( - release['size'], _len)) - - # checking digest and writing - h = hashlib.sha256() - with open(filepath, 'wb') as f: - for chunk in r.iter_content(): - h.update(chunk) - f.write(chunk) - - actual_digest = h.hexdigest() - if actual_digest != release['sha256']: - raise ValueError( - 'Error when checking the hash checksum: %s != %s' % ( - release['sha256'], actual_digest)) - - uncompress_path = os.path.join(path, 'uncompress') - os.makedirs(uncompress_path, exist_ok=True) - - nature = tarball.uncompress(filepath, uncompress_path) - _release['directory'] = uncompress_path - - artifact = convert_to_hex(hashutil.hash_path(filepath)) - artifact['archive_type'] = nature - for key, value in artifact.items(): - _release[key] = value - - yield version, _release - - class PyPiLoader(SWHStatelessLoader): CONFIG_BASE_FILENAME = 'loader/pypi' ADDITIONAL_CONFIG = { @@ -227,6 +83,8 @@ class PyPiLoader(SWHStatelessLoader): self.project_name = project_name self.origin_url = origin_url self.origin_metadata_url = origin_metadata_url + self.project = PyPiProject(self.pypi_client, self.project_name, + self.origin_metadata_url) def get_contents(self): return self._contents @@ -252,12 +110,7 @@ class PyPiLoader(SWHStatelessLoader): revisions, releases, snapshot) """ - project_info = 
def info(data):
    """Given a raw pypi metadata dict, return the project information
    subset we keep.

    Args:
        data (dict): raw json response of the pypi metadata api

    Returns:
        dict with home_page, description, summary, license,
        package_url, project_url and upstream keys.

    """
    info = data['info']
    return {
        'home_page': info['home_page'],
        'description': info['description'],
        'summary': info['summary'],
        'license': info['license'],
        'package_url': info['package_url'],
        'project_url': info['project_url'],
        'upstream': info['project_urls']['Homepage'],
    }


def author(data):
    """Given a raw pypi metadata dict, return an author subset in swh
    model form (fullname/name/email as bytes).

    Args:
        data (dict): raw json response of the pypi metadata api

    Returns:
        dict with 'fullname' ("name <email>" when an email is set,
        plain name otherwise), 'name' and 'email' keys, utf-8 encoded.

    """
    name = data['info']['author']
    email = data['info']['author_email']
    if email:
        fullname = '%s <%s>' % (name, email)
    else:
        fullname = name

    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8'),
    }


class PyPiProject:
    """PyPi project representation

    This permits to extract information for the:
    - project, either the latest information (from the last revision)
    - project information for a given release
    - same for author information

    """
    def __init__(self, client, project, project_metadata_url, data=None):
        self.client = client
        self.project = project
        self.project_metadata_url = project_metadata_url
        if data:
            self.data = data
        else:
            self.data = client.info(project_metadata_url)

        self.last_version = self.data['info']['version']
        # cache of version -> raw metadata, seeded with the latest
        # version so the project-level query is never repeated
        self.cache = {
            self.last_version: self.data
        }

    def _data(self, release_name=None):
        """Return (and cache) the raw metadata for `release_name`, or
        the project-level metadata when no release is given.

        """
        if release_name:
            data = self.cache.get(release_name)
            if not data:
                data = self.client.release(self.project, release_name)
                self.cache[release_name] = data
        else:
            data = self.data
        return data

    def info(self, release_name=None):
        """Project information subset for `release_name`, defaulting
        to the latest version's metadata.

        """
        return info(self._data(release_name))

    def author(self, release_name=None):
        """Author subset for `release_name`, defaulting to the latest
        version's metadata.

        """
        return author(self._data(release_name))

    def fetch_release(self, release_name=None):
        # FIXME: unimplemented placeholder; archives are actually
        # fetched through the client in `releases` below
        pass

    def releases(self):
        """Walk the project's releases in ascending version order,
        fetching each archive locally through the client.

        Yields:
            (version, {'info': ..., 'release': ...}) tuples where
            'info' is the per-release project subset and 'release' the
            flattened, locally-fetched release metadata.

        Raises:
            ValueError when a version exposes more than one artifact
            (only a single package type is supported for now).

        """
        releases_dict = self.data['releases']
        # sort releases in ascending order (plain string sort)
        for version in sorted(releases_dict):
            release = releases_dict[version]
            # then compute the information for each release
            if version == self.last_version:  # avoid an extra query
                release_info = self.info()
            else:
                release_info = self.info(release_name=version)

            # FIXME: there can be multiple 'package_type' here:
            # sdist, bdist_egg, bdist_wheel
            if isinstance(release, list):
                if len(release) > 1:
                    raise ValueError(
                        'Unsupported other formats for now, failing!')

                release = release[0]

            # flatten the metadata to ease reading
            _flattened_release = {
                'name': version,
                'message': release['comment_text'],
                'sha256': release['digests']['sha256'],
                'size': release['size'],
                'filename': release['filename'],
                'url': release['url'],
                'date': release['upload_time'],
            }
            # fetch and write the archive locally
            # bug fix: the client expects (project, release); the
            # version was passed as the project name, nesting archives
            # under temp/<version>/<version>
            _release = self.client.fetch_release(
                self.project, _flattened_release)

            yield version, {
                'info': release_info,
                'release': _release,
            }
+ + """ + if release == '0.1': + return { + 'info': { + 'home_page': 'something', + 'description': 'awesome python package', + 'summary': 'awesome python package', + 'license': '', + 'package_url': '', + 'project_url': '', + 'project_urls': { + 'Homepage': '' + }, + }, + 'releases': { + + }, + } + + def fetch_release(self, project, release): + """Avoid fetching and writing to disk, simply returns the release + object + + The production code will trigger the raw archive fetch and + writes to temporary disk, we avoid this here. + + """ + return release + +class ModelTest(TestCase): def setUp(self): with open('./swh/loader/pypi/tests/test_model_data.json') as f: self.data = json.load(f) - self.project = PyPiProject(self.data) + + self.project = PyPiProject( + client=MockPyPiClient(), + project='7xydothis', + project_metadata_url='https://pypi.org/pypi/7xydothis/json', + data=self.data) @istest def info(self): @@ -41,7 +81,7 @@ class ModelTest(TestCase): name = self.data['info']['author'].encode('utf-8') email = self.data['info']['author_email'].encode('utf-8') expected_author = { - 'fullname': name, + 'fullname': b'%s <%s>' % (name, email), 'name': name, 'email': email, } @@ -55,28 +95,43 @@ class ModelTest(TestCase): release0 = self.data['releases']['0.1'][0] release1 = self.data['releases']['0.1.1'][0] self.maxDiff = None - expected_releases = { - '0.1': { - 'name': '0.1', - 'message': release0['comment_text'], - 'sha256': release0['digests']['sha256'], - 'size': release0['size'], - 'filename': release0['filename'], - 'url': release0['url'], - 'date': release0['upload_time'], - }, - '0.1.1': { - 'name': '0.1.1', - 'message': release1['comment_text'], - 'sha256': release1['digests']['sha256'], - 'size': release1['size'], - 'filename': release1['filename'], - 'url': release1['url'], - 'date': release1['upload_time'], - } - } - - self.assertEqual(expected_releases, actual_releases) + release_011_info = self.project.info() + expected_releases = [ + ('0.1', { + 'info': { 
+ 'home_page': 'something', + 'description': 'awesome python package', + 'summary': 'awesome python package', + 'license': '', + 'package_url': '', + 'project_url': '', + 'upstream': '', + }, + 'release': { + 'name': '0.1', + 'message': release0['comment_text'], + 'sha256': release0['digests']['sha256'], + 'size': release0['size'], + 'filename': release0['filename'], + 'url': release0['url'], + 'date': release0['upload_time'], + } + }), + ('0.1.1', { + 'info': release_011_info, + 'release': { + 'name': '0.1.1', + 'message': release1['comment_text'], + 'sha256': release1['digests']['sha256'], + 'size': release1['size'], + 'filename': release1['filename'], + 'url': release1['url'], + 'date': release1['upload_time'], + } + }) + ] + + self.assertEqual(expected_releases, list(actual_releases)) @istest def releases_unexpected_release_format(self): @@ -84,5 +139,5 @@ class ModelTest(TestCase): data['releases']['0.1'].append({'anything': 'really to break'}) with self.assertRaisesRegex(ValueError, - 'Unexpected list of more than 1'): - self.project.releases() + 'Unsupported other formats for now'): + list(self.project.releases())