Skip to content
Snippets Groups Projects
Verified commit 27ea4219, authored by Antoine R. Dumont
Browse files

pypi.client: Compute hashes and write the tarball in one roundtrip

This also checks the tarball size prior to reading the first bytes.

Related T421
parent 7b6560bd
No related branches found
No related tags found
1 merge request: !1 "Bootstrap pypi loader"
......@@ -11,9 +11,10 @@ Build-Depends: debhelper (>= 9),
python3-requests,
python3-setuptools,
python3-swh.core,
python3-swh.loader.core,
python3-swh.model (>= 0.0.27~),
python3-swh.storage,
python3-swh.scheduler,
python3-swh.loader.core,
python3-vcversioner
Standards-Version: 3.9.6
Homepage: https://forge.softwareheritage.org/source/swh-loader-pypi.git
......@@ -22,6 +23,7 @@ Package: python3-swh.loader.pypi
Architecture: all
Depends: python3-swh.core,
python3-swh.loader.core,
python3-swh.model (>= 0.0.27~),
python3-swh.storage,
${misc:Depends}, ${python3:Depends}
Description: Software Heritage PyPI Loader
swh.core
swh.model >= 0.0.27
swh.storage
swh.scheduler
swh.loader.core
......@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import arrow
import hashlib
import logging
import os
import requests
......@@ -23,30 +22,6 @@ except ImportError:
__version__ = 'devel'
def convert_to_hex(d):
    """Convert a flat dictionary with bytes in values to the same dictionary
       with hex as values.

    Args:
        d (dict): flat dictionary with sha bytes in their values.

    Returns:
        Mirror dictionary with values as string hex. Values that are not
        ``bytes`` are copied over unchanged. A falsy input (``None`` or an
        empty dictionary) is returned as-is, without building a new dict.

    """
    if not d:
        return d

    # Only bytes values go through the hex conversion; everything else
    # (already-hex strings, lengths, ...) is mirrored verbatim.
    return {
        key: hashutil.hash_to_hex(h) if isinstance(h, bytes) else h
        for key, h in d.items()
    }
def _to_dict(pkginfo):
"""Given a pkginfo parsed structure, convert it to a dict.
......@@ -293,9 +268,12 @@ class PyPIClient:
if self.do_cache:
_filepath = self._get_raw(filepath)
if not _filepath: # no cache hit, we fetch from pypi
if _filepath: # cache hit
hashes = hashutil.hash_path(
filepath, with_length=False, hexdigest=True)
else: # no cache hit, we fetch from pypi
url = artifact['url']
r = self.session.get(url, **self.params)
r = self.session.get(url, **self.params, stream=True)
status = r.status_code
if status != 200:
if status == 404:
......@@ -305,37 +283,31 @@ class PyPIClient:
url, r.status_code, r.content)
raise ValueError(msg)
_len = len(r.content)
_len = int(r.headers['content-length'])
if _len != artifact['size']:
raise ValueError('Error when checking size: %s != %s' % (
artifact['size'], _len))
# checking digest and writing
h = hashlib.sha256()
with open(filepath, 'wb') as f:
for chunk in r.iter_content():
h.update(chunk)
def write_chunk(chunk, f=f):
f.write(chunk)
hashes = hashutil.hash_stream(r, length=_len, hexdigest=True,
chunk_cb=write_chunk)
actual_digest = h.hexdigest()
if actual_digest != artifact['sha256']:
raise ValueError(
'%s %s: Checksum mismatched: %s != %s' % (
project, version, artifact['sha256'], actual_digest))
actual_digest = hashes['sha256']
if actual_digest != artifact['sha256']:
raise ValueError(
'%s %s: Checksum mismatched: %s != %s' % (
project, version, artifact['sha256'], actual_digest))
if self.do_cache:
self._save_raw(filepath)
if not _filepath and self.do_cache:
self._save_raw(filepath)
uncompress_path = os.path.join(path, 'uncompress')
os.makedirs(uncompress_path, exist_ok=True)
nature = tarball.uncompress(filepath, uncompress_path)
hashes = hashutil.hash_path(filepath)
hashes.pop('length') # 'size' entry is already referenced
artifact_hashes = convert_to_hex(hashes)
artifact['archive_type'] = nature
artifact.update(artifact_hashes)
artifact.update(hashes)
pkginfo = _project_pkginfo(uncompress_path)
return release, artifact, filepath, uncompress_path, pkginfo
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment