Skip to content
Snippets Groups Projects
Unverified Commit 6a71db66 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

swh.loader.pypi.loader: Bootstrap pypi loader

Related T421
parent 94d353b8
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,9 @@ Priority: optional
Build-Depends: debhelper (>= 9),
dh-python (>= 2),
python3-all,
python3-arrow,
python3-nose,
python3-requests,
python3-setuptools,
python3-swh.core,
python3-swh.storage,
......
......@@ -35,6 +35,6 @@ setup(
install_requires=parse_requirements() + parse_requirements('swh'),
test_requires=parse_requirements('test'),
setup_requires=['vcversioner'],
vcversioner={},
vcversioner={'version_module_paths': ['swh/loader/pypi/_version.py']},
include_package_data=True,
)
# Declare 'swh' as a namespace package so that several independently
# distributed swh.* packages can share the same top-level package prefix.
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
import logging
import os
import requests
import shutil
from swh.core import tarball
from swh.loader.core.utils import clean_dangling_folders
from swh.loader.core.loader import SWHStatelessLoader
from swh.model import hashutil
from swh.model.from_disk import Directory
from swh.model.identifiers import (release_identifier, revision_identifier,
snapshot_identifier, identifier_to_bytes)
from .model import PyPiProject
try:
from swh.loader.pypi._version import __version__
except ImportError:
__version__ = 'devel'
TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
def convert_to_hex(d):
    """Convert a flat dictionary with bytes values to the same dictionary
    with hex strings as values.

    Args:
        d (dict): flat dictionary whose values may be sha bytes.

    Returns:
        Mirror dictionary with bytes values rendered as hex strings.
        Falsy inputs (None, {}) are returned unchanged.

    """
    if not d:
        return d
    # Only bytes values are converted; any other value is kept as-is.
    return {key: hashutil.hash_to_hex(h) if isinstance(h, bytes) else h
            for key, h in d.items()}
class PyPiClient:
    """PyPi client in charge of discussing with the pypi server.

    Args:
        temp_directory (str): Path to a working directory for downloads.
            When not provided, a fresh one is created under the system
            default temporary location.
        cache (bool): Whether to cache the raw json responses on disk.
        cache_dir (str): Cache directory; created if needed, only used
            when `cache` is True.

    """
    def __init__(self, temp_directory=None, cache=False, cache_dir=None):
        self.version = __version__
        if not temp_directory:
            from tempfile import mkdtemp
            # temp_directory is falsy here, so dir=None lets mkdtemp
            # pick the system default temporary location
            self.temp_directory = mkdtemp(dir=temp_directory,
                                          prefix='swh.loader.pypi.client')
        else:
            self.temp_directory = temp_directory

        self.do_cache = cache
        if self.do_cache:
            self.cache_dir = cache_dir
            os.makedirs(self.cache_dir, exist_ok=True)

        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage PyPi Loader (%s)' % (
                    __version__
                )
            }
        }

    def _save_response(self, response):
        """Log the response from a server request to the cache dir
        (self.cache_dir) as a gzipped, timestamp-named json file.

        Args:
            response: full server response

        Returns:
            nothing

        """
        import gzip
        from json import dumps
        from arrow import utcnow
        datepath = utcnow().isoformat()
        fname = os.path.join(self.cache_dir, datepath + '.gz')
        with gzip.open(fname, 'w') as f:
            f.write(bytes(
                dumps(response.json()),
                'UTF-8'
            ))

    def info(self, project_url):
        """Given a metadata project url, retrieve the raw json response.

        Args:
            project_url (str): Metadata url of a pypi project

        Returns:
            The parsed json metadata (dict)

        Raises:
            ValueError when the server does not answer with a 200

        """
        response = self.session.get(project_url, **self.params)
        if response.status_code != 200:
            # bug fix: previously referenced self.origin_url, which is
            # never set on this class (AttributeError on failure paths)
            raise ValueError('Fail to load origin %s' % project_url)

        if self.do_cache:
            self._save_response(response)

        return response.json()

    def retrieve_releases(self, project, releases):
        """Given a dictionary of releases, retrieve them locally.

        Each release artifact is downloaded, checked against its
        advertised size and sha256 digest, uncompressed, and the
        release entry is enriched with the uncompressed directory path
        and the artifact's checksums.

        Args:
            project (str): Project name
            releases (dict): Mapping version -> release information

        Returns:
            An enriched copy of the releases mapping

        Raises:
            ValueError when a download fails or a size/digest check
            does not match the advertised values

        """
        _releases = releases.copy()
        for version, release in releases.items():
            logging.debug('version: %s' % version)
            path = os.path.join(self.temp_directory, project, version)
            os.makedirs(path, exist_ok=True)
            filepath = os.path.join(path, release['filename'])
            logging.debug('filepath to write: %s' % filepath)

            # consistency fix: send the same headers (User-Agent) as
            # the metadata requests do
            r = self.session.get(release['url'], **self.params)
            if not r.ok:
                raise ValueError('Fail to retrieve release %s' % version)

            # checks
            _len = len(r.content)
            if _len != release['size']:
                raise ValueError('Error when checking size: %s != %s' % (
                    release['size'], _len))

            # checking digest and writing
            h = hashlib.sha256()
            with open(filepath, 'wb') as f:
                # explicit chunk size: iter_content()'s default is one
                # byte per chunk
                for chunk in r.iter_content(chunk_size=65536):
                    h.update(chunk)
                    f.write(chunk)

            actual_digest = h.hexdigest()
            if actual_digest != release['sha256']:
                raise ValueError(
                    'Error when checking the hash checksum: %s != %s' % (
                        release['sha256'], actual_digest))

            uncompress_path = os.path.join(path, 'uncompress')
            os.makedirs(uncompress_path, exist_ok=True)
            nature = tarball.uncompress(filepath, uncompress_path)

            _releases[version]['directory'] = uncompress_path
            artifact = convert_to_hex(hashutil.hash_path(filepath))
            artifact['archive_type'] = nature
            for key, value in artifact.items():
                _releases[version][key] = value

        return _releases
class PyPiLoader(SWHStatelessLoader):
    """Loader in charge of injecting a pypi project's releases into the
    archive: one synthetic revision and release per pypi release,
    gathered under a single snapshot.

    """
    CONFIG_BASE_FILENAME = 'loader/pypi'
    ADDITIONAL_CONFIG = {
        'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
        'cache': ('bool', False),
        'cache_dir': ('str', ''),
        'debug': ('bool', False),  # NOT FOR PRODUCTION
    }

    def __init__(self):
        super().__init__(logging_class='swh.loader.pypi.PyPiLoader')
        self.origin_id = None
        self.temp_directory = self.config['temp_directory']
        self.pypi_client = PyPiClient(
            temp_directory=self.temp_directory,
            cache=self.config['cache'],
            cache_dir=self.config['cache_dir'])
        self.debug = self.config['debug']

    def pre_cleanup(self):
        """(override) To prevent disk explosion...

        """
        clean_dangling_folders(self.temp_directory,
                               pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

    def cleanup(self):
        """(override) Clean up temporary disk use

        """
        if self.debug:
            self.log.warn('** DEBUG MODE ** Will not clean up temp dir %s' % (
                self.temp_directory
            ))
            return
        if os.path.exists(self.temp_directory):
            self.log.debug('Clean up %s' % self.temp_directory)
            shutil.rmtree(self.temp_directory)

    def prepare_origin_visit(self, project_name, origin_url,
                             origin_metadata_url=None):
        """(override) Prepare the origin visit information

        """
        self.origin = {
            'url': origin_url,
            'type': 'pypi',
        }
        # No explicit visit date: the loader core will default it.
        self.visit_date = None

    def prepare(self, project_name, origin_url,
                origin_metadata_url=None):
        """(override) Keep reference to the origin url (project) and the
        project metadata url

        """
        self.project_name = project_name
        self.origin_url = origin_url
        self.origin_metadata_url = origin_metadata_url

    def get_contents(self):
        return self.contents

    def get_directories(self):
        # bug fix: self.directories is a list (set in fetch_data); it
        # used to be called as if it were a method (TypeError)
        return self.directories

    def get_revisions(self):
        return self.revisions

    def get_releases(self):
        return self.releases

    def get_snapshot(self):
        return self.snapshot

    def fetch_data(self):
        """(override) Retrieve the pypi origin's information

        Fetches the project metadata and all release artifacts, then
        builds the contents, directories, synthetic revisions/releases
        and the snapshot exposed through the get_* accessors.

        """
        project_info = self.pypi_client.info(self.origin_metadata_url)
        project = PyPiProject(project_info)
        releases = self.pypi_client.retrieve_releases(
            self.project_name, project.releases())

        info = project.info()
        author = project.author()

        _contents = []
        _directories = []
        _revisions = []
        _releases = []
        _snapshot = {
            'branches': {}
        }

        # for each
        for version, release in releases.items():
            _dir_path = release.pop('directory')
            logging.debug('version: %s' % version)
            logging.debug('release: %s' % _dir_path)

            directory = Directory.from_disk(path=_dir_path.encode('utf-8'),
                                            save_path=True)
            _objects = directory.collect()

            # bug fix: extend with the collected objects instead of
            # appending dict-value view objects, which produced a list
            # of views rather than a flat list of contents/directories
            _contents.extend(_objects['content'].values())
            _directories.extend(_objects['directory'].values())

            _revision = {
                'synthetic': True,
                'metadata': {
                    'original_artifact': [release],
                    'project': info,
                },
                'author': author,
                'date': release['date'],
                'committer': author,
                'committer_date': release['date'],
                'name': release['name'],
                'message': release['message'],
                'directory': directory.hash,
                'parents': [],
                'type': 'tar',
            }
            _revision['id'] = identifier_to_bytes(
                revision_identifier(_revision))
            _revisions.append(_revision)

            _release = {
                'name': release['name'],
                'author': author,
                'date': release['date'],
                'message': release['message'],
                'target_type': 'revision',
                'target': _revision['id'],
            }
            _release['id'] = identifier_to_bytes(
                release_identifier(_release))
            _releases.append(_release)

            _snapshot['branches'][release['name']] = {
                'target': _release['id'],
                'target_type': 'release',
            }

        _snapshot['id'] = identifier_to_bytes(
            snapshot_identifier(_snapshot))

        self.contents = _contents
        self.directories = _directories
        self.revisions = _revisions
        self.releases = _releases
        self.snapshot = _snapshot
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
class PyPiProject:
    """PyPi project representation over the raw json metadata served by
    the pypi server.

    Args:
        data (dict): Raw project metadata (json response of the pypi
            metadata endpoint).

    """
    def __init__(self, data):
        self.data = data

    def info(self):
        """Return a filtered view of the project's 'info' entry.

        """
        return {
            'home_page': self.data['info']['home_page'],
            'description': self.data['info']['description'],
            'summary': self.data['info']['summary'],
            'license': self.data['info']['license'],
            'package_url': self.data['info']['package_url'],
            'project_url': self.data['info']['project_url'],
            'upstream': self.data['info']['project_urls']['Homepage'],
        }

    def author(self):
        """Return the project's author as a swh-style person dict.

        """
        return {
            'fullname': self.data['info']['author'],
            'name': self.data['info']['author'],
            'email': self.data['info']['author_email']
        }

    def releases(self):
        """Return a mapping version -> normalized release information.

        Raises:
            ValueError when a version unexpectedly carries more than
            one artifact.

        """
        releases = {}
        for version, release in self.data['releases'].items():
            if isinstance(release, list):
                # robustness fix: a version may carry no artifact at
                # all (empty list); skip it instead of crashing on [0]
                if not release:
                    continue
                if len(release) > 1:
                    raise ValueError(  # unexpected so fail so that we
                        # can fix later
                        'Unexpected list of more than 1 element, failing!')
                release = release[0]
            releases[version] = {
                'name': version,
                'message': release['comment_text'],
                'sha256': release['digests']['sha256'],
                'size': release['size'],
                'filename': release['filename'],
                'url': release['url'],
                'date': release['upload_time'],
            }
        return releases
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.task import Task
from .loader import PyPiLoader
class LoadPyPiTsk(Task):
    """Celery task loading one pypi project into the archive."""
    task_queue = 'swh_loader_pypi'

    def run_task(self, project_name, project_url, project_metadata_url=None):
        """Load the given pypi project through a fresh PyPiLoader."""
        pypi_loader = PyPiLoader()
        pypi_loader.log = self.log
        result = pypi_loader.load(
            project_name,
            project_url,
            origin_metadata_url=project_metadata_url)
        return result
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from unittest import TestCase
from nose.tools import istest
from swh.loader.pypi.model import PyPiProject
class ModelTest(TestCase):
    """Check the PyPiProject view over a raw pypi metadata fixture."""

    def setUp(self):
        fixture = './swh/loader/pypi/tests/test_model_data.json'
        with open(fixture) as f:
            self.data = json.load(f)
        self.project = PyPiProject(self.data)

    @istest
    def info(self):
        raw_info = self.data['info']
        expected_info = {key: raw_info[key] for key in (
            'home_page', 'description', 'summary', 'license',
            'package_url', 'project_url')}
        expected_info['upstream'] = raw_info['project_urls']['Homepage']
        self.assertEqual(expected_info, self.project.info())

    @istest
    def author(self):
        raw_info = self.data['info']
        expected_author = {
            'fullname': raw_info['author'],
            'name': raw_info['author'],
            'email': raw_info['author_email'],
        }
        self.assertEqual(expected_author, self.project.author())

    @istest
    def releases(self):
        self.maxDiff = None
        expected_releases = {}
        for version in ('0.1', '0.1.1'):
            raw = self.data['releases'][version][0]
            expected_releases[version] = {
                'name': version,
                'message': raw['comment_text'],
                'sha256': raw['digests']['sha256'],
                'size': raw['size'],
                'filename': raw['filename'],
                'url': raw['url'],
                'date': raw['upload_time'],
            }
        self.assertEqual(expected_releases, self.project.releases())

    @istest
    def releases_unexpected_release_format(self):
        # NOTE: .copy() is shallow, so the nested release list is
        # shared with self.data (and thus with self.project)
        broken = self.data.copy()
        broken['releases']['0.1'].append({'anything': 'really to break'})
        with self.assertRaisesRegex(ValueError,
                                    'Unexpected list of more than 1'):
            self.project.releases()
{
"info": {
"author": "bernardfrk",
"author_email": "bernard.frk@gmail.com",
"bugtrack_url": null,
"classifiers": [],
"description": "Utitilies to use the 7xydothis APIs",
"description_content_type": null,
"docs_url": null,
"download_url": "UNKNOWN",
"downloads": {
"last_day": -1,
"last_month": -1,
"last_week": -1
},
"home_page": "https://github.com/frkb/7xydothis",
"keywords": null,
"license": "UNKNOWN",
"maintainer": null,
"maintainer_email": null,
"name": "7xydothis",
"package_url": "https://pypi.org/project/7xydothis/",
"platform": "UNKNOWN",
"project_url": "https://pypi.org/project/7xydothis/",
"project_urls": {
"Download": "UNKNOWN",
"Homepage": "https://github.com/frkb/7xydothis"
},
"release_url": "https://pypi.org/project/7xydothis/0.1.1/",
"requires_dist": null,
"requires_python": null,
"summary": "Utitilies to use the 7xydothis APIs",
"version": "0.1.1"
},
"last_serial": 2668125,
"releases": {
"0.1": [
{
"comment_text": "",
"digests": {
"md5": "578e4bde98db732109d0698aba168a06",
"sha256": "7e6f59be532d43ac0ad32da6a068417f0973285a38a08f3f5056f79770f2f973"
},
"downloads": -1,
"filename": "7xydothis-0.1.tar.gz",
"has_sig": false,
"md5_digest": "578e4bde98db732109d0698aba168a06",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 868,
"upload_time": "2017-02-25T21:31:02",
"url": "https://files.pythonhosted.org/packages/68/55/6a00e46a1a10e7a0731e50cbcc9f6243c5112eeda8326d781a03a1254105/7xydothis-0.1.tar.gz"
}
],
"0.1.1": [
{
"comment_text": "",
"digests": {
"md5": "75fe55b933330adbde027b6edc74863d",
"sha256": "76d243b70a10d51ea87312a97a7d7b1a525984fd56d1c5f41650a1fa0fde1bc1"
},
"downloads": -1,
"filename": "7xydothis-0.1.1.tar.gz",
"has_sig": false,
"md5_digest": "75fe55b933330adbde027b6edc74863d",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 875,
"upload_time": "2017-02-25T21:41:37",
"url": "https://files.pythonhosted.org/packages/96/64/6fd8e189aa97820b306f06dbce02d618bf155379575c553db3d2c2eda045/7xydothis-0.1.1.tar.gz"
}
]
},
"urls": [
{
"comment_text": "",
"digests": {
"md5": "75fe55b933330adbde027b6edc74863d",
"sha256": "76d243b70a10d51ea87312a97a7d7b1a525984fd56d1c5f41650a1fa0fde1bc1"
},
"downloads": -1,
"filename": "7xydothis-0.1.1.tar.gz",
"has_sig": false,
"md5_digest": "75fe55b933330adbde027b6edc74863d",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 875,
"upload_time": "2017-02-25T21:41:37",
"url": "https://files.pythonhosted.org/packages/96/64/6fd8e189aa97820b306f06dbce02d618bf155379575c553db3d2c2eda045/7xydothis-0.1.1.tar.gz"
}
]
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment