Skip to content
Snippets Groups Projects
Unverified commit d2390dc4, authored by Antoine R. Dumont
Browse files

loader.pypi: Store pypi origin's releases in storage

Related T421
parent 6a71db66
No related branches found
No related tags found
No related merge requests found
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version # License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
import arrow
import hashlib import hashlib
import logging import logging
import os import os
...@@ -14,8 +15,10 @@ from swh.loader.core.utils import clean_dangling_folders ...@@ -14,8 +15,10 @@ from swh.loader.core.utils import clean_dangling_folders
from swh.loader.core.loader import SWHStatelessLoader from swh.loader.core.loader import SWHStatelessLoader
from swh.model import hashutil from swh.model import hashutil
from swh.model.from_disk import Directory from swh.model.from_disk import Directory
from swh.model.identifiers import (release_identifier, revision_identifier, from swh.model.identifiers import (
snapshot_identifier, identifier_to_bytes) release_identifier, revision_identifier, snapshot_identifier,
identifier_to_bytes, normalize_timestamp
)
from .model import PyPiProject from .model import PyPiProject
...@@ -89,8 +92,7 @@ class PyPiClient: ...@@ -89,8 +92,7 @@ class PyPiClient:
""" """
import gzip import gzip
from json import dumps from json import dumps
from arrow import utcnow datepath = arrow.utcnow().isoformat()
datepath = utcnow().isoformat()
fname = os.path.join(self.cache_dir, datepath + '.gz') fname = os.path.join(self.cache_dir, datepath + '.gz')
with gzip.open(fname, 'w') as f: with gzip.open(fname, 'w') as f:
f.write(bytes( f.write(bytes(
...@@ -115,13 +117,18 @@ class PyPiClient: ...@@ -115,13 +117,18 @@ class PyPiClient:
"""Given a dictionary of releases, retrieve them locally. """Given a dictionary of releases, retrieve them locally.
""" """
_releases = releases.copy() # order the release in time order
for version, release in releases.items(): _release_versions = list(releases.keys())
logging.debug('version: %s' % version) _release_versions.sort()
for version in _release_versions:
release = releases[version]
_release = release.copy()
logging.debug('Release version: %s' % version)
path = os.path.join(self.temp_directory, project, version) path = os.path.join(self.temp_directory, project, version)
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
filepath = os.path.join(path, release['filename']) filepath = os.path.join(path, release['filename'])
logging.debug('filepath to write: %s' % filepath) logging.debug('Release local path: %s' % filepath)
r = self.session.get(release['url']) r = self.session.get(release['url'])
if not r.ok: if not r.ok:
...@@ -150,14 +157,14 @@ class PyPiClient: ...@@ -150,14 +157,14 @@ class PyPiClient:
os.makedirs(uncompress_path, exist_ok=True) os.makedirs(uncompress_path, exist_ok=True)
nature = tarball.uncompress(filepath, uncompress_path) nature = tarball.uncompress(filepath, uncompress_path)
_releases[version]['directory'] = uncompress_path _release['directory'] = uncompress_path
artifact = convert_to_hex(hashutil.hash_path(filepath)) artifact = convert_to_hex(hashutil.hash_path(filepath))
artifact['archive_type'] = nature artifact['archive_type'] = nature
for key, value in artifact.items(): for key, value in artifact.items():
_releases[version][key] = value _release[key] = value
return _releases yield version, _release
class PyPiLoader(SWHStatelessLoader): class PyPiLoader(SWHStatelessLoader):
...@@ -222,22 +229,27 @@ class PyPiLoader(SWHStatelessLoader): ...@@ -222,22 +229,27 @@ class PyPiLoader(SWHStatelessLoader):
self.origin_metadata_url = origin_metadata_url self.origin_metadata_url = origin_metadata_url
def get_contents(self): def get_contents(self):
return self.contents return self._contents
def get_directories(self): def get_directories(self):
return self.directories() return self._directories
def get_revisions(self): def get_revisions(self):
return self.revisions return self._revisions
def get_releases(self): def get_releases(self):
return self.releases return self._releases
def get_snapshot(self): def get_snapshot(self):
return self.snapshot return self._snapshot
def fetch_data(self): def fetch_data(self):
"""(override) Retrieve the pypi origin's information """(override) Compute pypi data:
- 1. Retrieve project information
- 2. Fetch the releases and uncompress them
- 3. Collection object information (contents, directories,
revisions, releases, snapshot)
""" """
project_info = self.pypi_client.info(self.origin_metadata_url) project_info = self.pypi_client.info(self.origin_metadata_url)
...@@ -255,16 +267,22 @@ class PyPiLoader(SWHStatelessLoader): ...@@ -255,16 +267,22 @@ class PyPiLoader(SWHStatelessLoader):
'branches': {} 'branches': {}
} }
# for each _last_rev = None
for version, release in releases.items():
for version, release in releases:
_dir_path = release.pop('directory') _dir_path = release.pop('directory')
directory = Directory.from_disk(path=_dir_path.encode('utf-8'), _dir_path = _dir_path.encode('utf-8')
save_path=True) directory = Directory.from_disk(path=_dir_path, data=True)
_objects = directory.collect() _objects = directory.collect()
_contents.append(_objects['content'].values()) _contents.extend(_objects['content'].values())
_directories.append(_objects['directory'].values()) _directories.extend(_objects['directory'].values())
date = normalize_timestamp(
int(arrow.get(release['date']).timestamp))
name = release['name'].encode('utf-8')
message = release['message'].encode('utf-8')
_revision = { _revision = {
'synthetic': True, 'synthetic': True,
'metadata': { 'metadata': {
...@@ -272,44 +290,43 @@ class PyPiLoader(SWHStatelessLoader): ...@@ -272,44 +290,43 @@ class PyPiLoader(SWHStatelessLoader):
'project': info, 'project': info,
}, },
'author': author, 'author': author,
'date': release['date'], 'date': date,
'committer': author, 'committer': author,
'committer_date': release['date'], 'committer_date': date,
'name': release['name'], 'name': name,
'message': release['message'], 'message': message,
'directory': directory.hash, 'directory': directory.hash,
'parents': [], 'parents': [] if _last_rev is None else [_last_rev['id']],
'type': 'tar', 'type': 'tar',
} }
_revision['id'] = identifier_to_bytes( _revision['id'] = identifier_to_bytes(
revision_identifier(_revision)) revision_identifier(_revision))
_revisions.append(_revision) _revisions.append(_revision)
_last_rev = _revision
_release = { _release = {
'name': release['name'], 'name': name,
'author': author, 'author': author,
'date': release['date'], 'date': date,
'message': release['message'], 'message': message,
'target_type': 'revision', 'target_type': 'revision',
'target': _revision['id'], 'target': _revision['id'],
'synthetic': False,
} }
_release['id'] = identifier_to_bytes( _release['id'] = identifier_to_bytes(
release_identifier(_release)) release_identifier(_release))
_releases.append(_release) _releases.append(_release)
_snapshot['branches'][release['name']] = { _snapshot['branches'][name] = {
'target': _release['id'], 'target': _release['id'],
'target_type': 'release', 'target_type': 'release',
} }
logging.debug('version: %s' % version)
logging.debug('release: %s' % release['directory'])
_snapshot['id'] = identifier_to_bytes( _snapshot['id'] = identifier_to_bytes(
snapshot_identifier(_snapshot)) snapshot_identifier(_snapshot))
self.contents = _contents self._contents = _contents
self.directories = _directories self._directories = _directories
self.revisions = _revisions self._revisions = _revisions
self.releases = _releases self._releases = _releases
self.snapshot = _snapshot self._snapshot = _snapshot
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
# See top-level LICENSE file for more information # See top-level LICENSE file for more information
import logging
class PyPiProject: class PyPiProject:
"""PyPi project representation """PyPi project representation
...@@ -23,15 +26,20 @@ class PyPiProject: ...@@ -23,15 +26,20 @@ class PyPiProject:
} }
def author(self): def author(self):
name = self.data['info']['author'].encode('utf-8')
email = self.data['info']['author_email'].encode('utf-8')
return { return {
'fullname': self.data['info']['author'], 'fullname': name,
'name': self.data['info']['author'], 'name': name,
'email': self.data['info']['author_email'] 'email': email,
} }
def releases(self): def releases(self):
releases = {} releases = {}
for version, release in self.data['releases'].items(): for version, release in self.data['releases'].items():
logging.debug('version: %s, release: %s' % (version, release))
# FIXME: there can be multiple 'package_type' here:
# sdist, bdist_egg, bdist_wheel
if isinstance(release, list): if isinstance(release, list):
if len(release) > 1: if len(release) > 1:
raise ValueError( # unexpected so fail so that we raise ValueError( # unexpected so fail so that we
......
...@@ -38,10 +38,12 @@ class ModelTest(TestCase): ...@@ -38,10 +38,12 @@ class ModelTest(TestCase):
def author(self): def author(self):
actual_author = self.project.author() actual_author = self.project.author()
name = self.data['info']['author'].encode('utf-8')
email = self.data['info']['author_email'].encode('utf-8')
expected_author = { expected_author = {
'fullname': self.data['info']['author'], 'fullname': name,
'name': self.data['info']['author'], 'name': name,
'email': self.data['info']['author_email'], 'email': email,
} }
self.assertEqual(expected_author, actual_author) self.assertEqual(expected_author, actual_author)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment