Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • anlambert/swh-model
  • lunar/swh-model
  • franckbret/swh-model
  • douardda/swh-model
  • olasd/swh-model
  • swh/devel/swh-model
  • Alphare/swh-model
  • samplet/swh-model
  • marmoute/swh-model
  • rboyer/swh-model
10 results
Show changes
Showing
with 5367 additions and 1520 deletions
......@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
......@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
......@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
......@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
......@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
......@@ -124,9 +123,13 @@ class ValidationError(Exception):
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
......@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
......@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
......@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
......@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
......@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
......@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
......@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
......@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
......@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
......@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
......@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
......@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
......@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
......@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
......@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
This diff is collapsed.
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import stat
from enum import Enum
from swh.model import hashutil, identifiers
ROOT_TREE_KEY = b''
class GitType(Enum):
BLOB = b'blob'
TREE = b'tree'
EXEC = b'exec'
LINK = b'link'
COMM = b'commit'
RELE = b'release'
REFS = b'ref'
class GitPerm(Enum):
BLOB = b'100644'
TREE = b'40000'
EXEC = b'100755'
LINK = b'120000'
def _compute_directory_git_sha1(hashes):
"""Compute a directory git sha1 from hashes.
Args:
hashes: list of tree entries with keys:
- sha1_git: the tree entry's sha1
- name: file or subdir's name
- perms: the tree entry's sha1 permissions
Returns:
the binary sha1 of the dictionary's identifier
Assumes:
Every path exists in hashes.
"""
directory = {
'entries':
[
{
'name': entry['name'],
'perms': int(entry['perms'].value, 8),
'target': entry['sha1_git'],
'type': 'dir' if entry['perms'] == GitPerm.TREE else 'file',
}
for entry in hashes
]
}
return hashutil.hash_to_bytes(identifiers.directory_identifier(directory))
def compute_directory_git_sha1(dirpath, hashes):
"""Compute a directory git sha1 for a dirpath.
Args:
dirpath: the directory's absolute path
hashes: list of tree entries with keys:
- sha1_git: the tree entry's sha1
- name: file or subdir's name
- perms: the tree entry's sha1 permissions
Returns:
the binary sha1 of the dictionary's identifier
Assumes:
Every path exists in hashes.
"""
return _compute_directory_git_sha1(hashes[dirpath])
def compute_revision_sha1_git(revision):
"""Compute a revision sha1 git from its dict representation.
Args:
revision: Additional dictionary information needed to compute a
synthetic
revision. Following keys are expected:
- author
- date
- committer
- committer_date
- message
- type
- directory: binary form of the tree hash
Returns:
revision sha1 in bytes
# FIXME: beware, bytes output from storage api
"""
return hashutil.hash_to_bytes(identifiers.revision_identifier(revision))
def compute_release_sha1_git(release):
"""Compute a release sha1 git from its dict representation.
Args:
release: Additional dictionary information needed to compute a
synthetic release. Following keys are expected:
- name
- message
- date
- author
- revision: binary form of the sha1_git revision targeted by this
Returns:
release sha1 in bytes
"""
return hashutil.hash_to_bytes(identifiers.release_identifier(release))
def compute_link_metadata(linkpath):
"""Given a linkpath, compute the git metadata.
Args:
linkpath: absolute pathname of the link
Returns:
Dictionary of values:
- data: link's content
- length: link's content length
- name: basename of the link
- perms: git permission for link
- type: git type for link
- path: absolute path to the link on filesystem
"""
data = os.readlink(linkpath)
link_metadata = hashutil.hash_data(data)
link_metadata.update({
'data': data,
'length': len(data),
'name': os.path.basename(linkpath),
'perms': GitPerm.LINK,
'type': GitType.BLOB,
'path': linkpath
})
return link_metadata
def compute_blob_metadata(filepath):
"""Given a filepath resolving to a regular file, compute the metadata.
Other file types (fifo, character or block device, symlink) will
be considered empty regular file. To deal properly with symlinks,
use swh.model.git.compute_link_metadata.
Args:
filepath: absolute pathname of the regular file.
Returns:
Dictionary of values:
- name: basename of the file
- length: data length
- perms: git permission for file
- type: git type for file
- path: absolute filepath on filesystem
"""
mode = os.lstat(filepath).st_mode
if not stat.S_ISREG(mode): # special (block or character device, fifo)
perms = GitPerm.BLOB
blob_metadata = hashutil.hash_data(b'')
blob_metadata['length'] = 0
else:
perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB
blob_metadata = hashutil.hash_path(filepath)
blob_metadata.update({
'name': os.path.basename(filepath),
'perms': perms,
'type': GitType.BLOB,
'path': filepath
})
return blob_metadata
def _compute_tree_metadata(dirname, hashes):
"""Given a dirname, compute the git metadata.
Args:
dirname: absolute pathname of the directory.
hashes: list of tree dirname's entries with keys:
- sha1_git: the tree entry's sha1
- name: file or subdir's name
- perms: the tree entry's sha1 permissions
Returns:
Dictionary of values:
- sha1_git: tree's sha1 git
- name: basename of the directory
- perms: git permission for directory
- type: git type for directory
- path: absolute path to directory on filesystem
"""
return {
'sha1_git': _compute_directory_git_sha1(hashes),
'name': os.path.basename(dirname),
'perms': GitPerm.TREE,
'type': GitType.TREE,
'path': dirname
}
def compute_tree_metadata(dirname, ls_hashes):
"""Given a dirname, compute the git metadata.
Args:
dirname: absolute pathname of the directory.
ls_hashes: dictionary of path, hashes
Returns:
Dictionary of values:
- sha1_git: tree's sha1 git
- name: basename of the directory
- perms: git permission for directory
- type: git type for directory
- path: absolute path to directory on filesystem
"""
return _compute_tree_metadata(dirname, ls_hashes[dirname])
def default_validation_dir(dirpath):
"""Default validation function.
This is the equivalent of the identity function.
Args:
dirpath: Path to validate
Returns: True
"""
return True
def _walk(rootdir,
dir_ok_fn=default_validation_dir,
remove_empty_folder=False):
"""Walk the filesystem and yields a 3 tuples (dirpath, dirnames as set
of absolute paths, filenames as set of abslute paths)
Ignore files which won't pass the dir_ok_fn validation.
If remove_empty_folder is True, remove and ignore any
encountered empty folder.
Args:
- rootdir: starting walk root directory path
- dir_ok_fn: validation function. if folder encountered are
not ok, they are ignored. Default to default_validation_dir
which does nothing.
- remove_empty_folder: Flag to remove and ignore any
encountered empty folders.
Yields:
3 tuples dirpath, set of absolute children dirname paths, set
of absolute filename paths.
"""
def basic_gen_dir(rootdir):
for dp, dns, fns in os.walk(rootdir, topdown=False):
yield (dp,
set((os.path.join(dp, dn) for dn in dns)),
set((os.path.join(dp, fn) for fn in fns)))
if dir_ok_fn == default_validation_dir:
if not remove_empty_folder: # os.walk
yield from basic_gen_dir(rootdir)
else: # os.walk + empty dir cleanup
empty_folders = set()
for dp, dns, fns in basic_gen_dir(rootdir):
if not dns and not fns:
empty_folders.add(dp)
# need to remove it because folder of empty folder
# is an empty folder!!!
if os.path.islink(dp):
os.remove(dp)
else:
os.rmdir(dp)
parent = os.path.dirname(dp)
# edge case about parent containing one empty
# folder which become an empty one
while not os.listdir(parent):
empty_folders.add(parent)
if os.path.islink(parent):
os.remove(parent)
else:
os.rmdir(parent)
parent = os.path.dirname(parent)
continue
yield (dp, dns - empty_folders, fns)
else:
def filtfn(dirnames):
return set(filter(dir_ok_fn, dirnames))
gen_dir = ((dp, dns, fns) for dp, dns, fns
in basic_gen_dir(rootdir) if dir_ok_fn(dp))
if not remove_empty_folder: # os.walk + filtering
for dp, dns, fns in gen_dir:
yield (dp, filtfn(dns), fns)
else: # os.walk + filtering + empty dir cleanup
empty_folders = set()
for dp, dns, fns in gen_dir:
dps = filtfn(dns)
if not dps and not fns:
empty_folders.add(dp)
# need to remove it because folder of empty folder
# is an empty folder!!!
if os.path.islink(dp):
os.remove(dp)
else:
os.rmdir(dp)
parent = os.path.dirname(dp)
# edge case about parent containing one empty
# folder which become an empty one
while not os.listdir(parent):
empty_folders.add(parent)
if os.path.islink(parent):
os.remove(parent)
else:
os.rmdir(parent)
parent = os.path.dirname(parent)
continue
yield dp, dps - empty_folders, fns
def walk_and_compute_sha1_from_directory(rootdir,
dir_ok_fn=default_validation_dir,
with_root_tree=True,
remove_empty_folder=False):
"""(Deprecated) TODO migrate the code to
compute_hashes_from_directory.
Compute git sha1 from directory rootdir.
Args:
- rootdir: Root directory from which beginning the git hash computation
- dir_ok_fn: Filter function to filter directory according to rules
defined in the function. By default, all folders are ok.
Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath
- with_root_tree: Determine if we compute the upper root tree's
checksums. As a default, we want it. One possible use case where this
is not useful is the update (cf. `update_checksums_from`)
Returns:
Dictionary of entries with keys <path-name> and as values a list of
directory entries.
Those are list of dictionary with keys:
- 'perms'
- 'type'
- 'name'
- 'sha1_git'
- and specifically content: 'sha1', 'sha256', ...
Note:
One special key is ROOT_TREE_KEY to indicate the upper root of the
directory (this is the revision's directory).
Raises:
Nothing
If something is raised, this is a programmatic error.
"""
ls_hashes = {}
all_links = set()
if rootdir.endswith(b'/'):
rootdir = rootdir.rstrip(b'/')
for dirpath, dirnames, filenames in _walk(
rootdir, dir_ok_fn, remove_empty_folder):
hashes = []
links = (file
for file in filenames.union(dirnames)
if os.path.islink(file))
for linkpath in links:
all_links.add(linkpath)
m_hashes = compute_link_metadata(linkpath)
hashes.append(m_hashes)
for filepath in (file for file in filenames if file not in all_links):
m_hashes = compute_blob_metadata(filepath)
hashes.append(m_hashes)
ls_hashes[dirpath] = hashes
dir_hashes = []
for fulldirname in (dir for dir in dirnames if dir not in all_links):
tree_hash = _compute_tree_metadata(fulldirname,
ls_hashes[fulldirname])
dir_hashes.append(tree_hash)
ls_hashes[dirpath].extend(dir_hashes)
if with_root_tree:
# compute the current directory hashes
root_hash = {
'sha1_git': _compute_directory_git_sha1(ls_hashes[rootdir]),
'path': rootdir,
'name': os.path.basename(rootdir),
'perms': GitPerm.TREE,
'type': GitType.TREE
}
ls_hashes[ROOT_TREE_KEY] = [root_hash]
return ls_hashes
def compute_hashes_from_directory(rootdir,
dir_ok_fn=default_validation_dir,
remove_empty_folder=False):
"""Compute git sha1 from directory rootdir.
Args:
- rootdir: Root directory from which beginning the git hash
computation
- dir_ok_fn: Filter function to filter directory according to rules
defined in the function. By default, all folders are ok.
Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath
Returns:
Dictionary of entries with keys absolute path name.
Path-name can be a file/link or directory.
The associated value is a dictionary with:
- checksums: the dictionary with the hashes for the link/file/dir
Those are list of dictionary with keys:
- 'perms'
- 'type'
- 'name'
- 'sha1_git'
- and specifically content: 'sha1', 'sha256', ...
- children: Only for a directory, the set of children paths
Note:
One special key is the / which indicates the upper root of
the directory (this is the revision's directory).
Raises:
Nothing
If something is raised, this is a programmatic error.
"""
def _get_dict_from_dirpath(_dict, path):
"""Retrieve the default associated value for key path.
"""
return _dict.get(path, dict(children=set(), checksums=None))
def _get_dict_from_filepath(_dict, path):
"""Retrieve the default associated value for key path.
"""
return _dict.get(path, dict(checksums=None))
ls_hashes = {}
all_links = set()
if rootdir.endswith(b'/'):
rootdir = rootdir.rstrip(b'/')
for dirpath, dirnames, filenames in _walk(
rootdir, dir_ok_fn, remove_empty_folder):
dir_entry = _get_dict_from_dirpath(ls_hashes, dirpath)
children = dir_entry['children']
links = (file
for file in filenames.union(dirnames)
if os.path.islink(file))
for linkpath in links:
all_links.add(linkpath)
m_hashes = compute_link_metadata(linkpath)
d = _get_dict_from_filepath(ls_hashes, linkpath)
d['checksums'] = m_hashes
ls_hashes[linkpath] = d
children.add(linkpath)
for filepath in (file for file in filenames if file not in all_links):
m_hashes = compute_blob_metadata(filepath)
d = _get_dict_from_filepath(ls_hashes, filepath)
d['checksums'] = m_hashes
ls_hashes[filepath] = d
children.add(filepath)
for fulldirname in (dir for dir in dirnames if dir not in all_links):
d_hashes = _get_dict_from_dirpath(ls_hashes, fulldirname)
tree_hash = _compute_tree_metadata(
fulldirname,
(ls_hashes[p]['checksums'] for p in d_hashes['children'])
)
d = _get_dict_from_dirpath(ls_hashes, fulldirname)
d['checksums'] = tree_hash
ls_hashes[fulldirname] = d
children.add(fulldirname)
dir_entry['children'] = children
ls_hashes[dirpath] = dir_entry
# compute the current directory hashes
d_hashes = _get_dict_from_dirpath(ls_hashes, rootdir)
root_hash = {
'sha1_git': _compute_directory_git_sha1(
(ls_hashes[p]['checksums'] for p in d_hashes['children'])
),
'path': rootdir,
'name': os.path.basename(rootdir),
'perms': GitPerm.TREE,
'type': GitType.TREE
}
d_hashes['checksums'] = root_hash
ls_hashes[rootdir] = d_hashes
return ls_hashes
def children_hashes(children, objects):
"""Given a collection of children path, yield the corresponding
hashes.
Args:
objects: objects hash as returned by git.compute_hashes_from_directory.
children: collection of bytes path
Yields:
Dictionary hashes
"""
for p in children:
c = objects.get(p)
if c:
h = c.get('checksums')
if h:
yield h
def objects_per_type(filter_type, objects_per_path):
"""Given an object dictionary returned by
`swh.model.git.compute_hashes_from_directory`, yields
corresponding element type's hashes
Args:
filter_type: one of GitType enum
objects_per_path:
Yields:
Elements of type filter_type's hashes
"""
for path, obj in objects_per_path.items():
o = obj['checksums']
if o['type'] == filter_type:
if 'children' in obj: # for trees
if obj['children']:
o['children'] = children_hashes(obj['children'],
objects_per_path)
else:
o['children'] = []
yield o
This diff is collapsed.
# Copyright (C) 2015-2017 The Software Heritage developers
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -10,61 +10,207 @@ Only a subset of hashing algorithms is supported as defined in the
ALGORITHMS set. Any provided algorithms not in that list will result
in a ValueError explaining the error.
This modules defines the following hashing functions:
This module defines a MultiHash class to ease the softwareheritage
hashing algorithms computation. This allows to compute hashes from
file object, path, data using a similar interface as what the standard
hashlib module provides.
- hash_file: Hash the contents of the given file object with the given
algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).
Basic usage examples:
- hash_data: Hash the given binary blob with the given algorithms
(defaulting to DEFAULT_ALGORITHMS if none provided).
- file object: MultiHash.from_file(
file_object, hash_names=DEFAULT_ALGORITHMS).digest()
- path (filepath): MultiHash.from_path(b'foo').hexdigest()
- data (bytes): MultiHash.from_data(b'foo').bytehexdigest()
"Complex" usage, defining a swh hashlib instance first:
- To compute length, integrate the length to the set of algorithms to
compute, for example:
.. code-block:: python
h = MultiHash(hash_names=set({'length'}).union(DEFAULT_ALGORITHMS))
with open(filepath, 'rb') as f:
h.update(f.read(HASH_BLOCK_SIZE))
hashes = h.digest() # returns a dict of {hash_algo_name: hash_in_bytes}
- Write alongside computing hashing algorithms (from a stream), example:
.. code-block:: python
h = MultiHash(length=length)
with open(filepath, 'wb') as f:
for chunk in r.iter_content(): # r a stream of sort
h.update(chunk)
f.write(chunk)
hashes = h.hexdigest() # returns a dict of {hash_algo_name: hash_in_hex}
- hash_path: Hash the contents of the file at the given path with the
given algorithms (defaulting to DEFAULT_ALGORITHMS if none
provided).
"""
import binascii
import functools
import hashlib
from io import BytesIO
import os
import sys
from typing import Callable, Dict, Optional, Union
from io import BytesIO
ALGORITHMS = set(
["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
)
"""Hashing algorithms supported by this module"""
# Supported algorithms
ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512'])
DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
"""Algorithms computed by default when calling the functions from this module.
# Default algorithms used
DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
Subset of :const:`ALGORITHMS`.
"""
# should be a multiple of 64 (sha1/sha256's block size)
# FWIW coreutils' sha1sum uses 32768
HASH_BLOCK_SIZE = 32768
"""Block size for streaming hash computations made in this module"""
_blake2_hash_cache: Dict[str, Callable] = {}
# Prior to python3.4, only blake2 is available through pyblake2 module
# From 3.5 onwards, it's been integrated in python
if sys.version_info.major == 3 and sys.version_info.minor <= 4:
import pyblake2
# register those hash algorithms in hashlib
__cache = hashlib.__builtin_constructor_cache
__cache['blake2s256'] = pyblake2.blake2s
__cache['blake2b512'] = pyblake2.blake2b
class MultiHash:
"""Hashutil class to support multiple hashes computation.
Args:
hash_names (set): Set of hash algorithms (+ optionally length)
to compute hashes (cf. DEFAULT_ALGORITHMS)
length (int): Length of the total sum of chunks to read
def _new_git_hash(base_algo, git_type, length):
"""Initialize a digest object (as returned by python's hashlib) for the
requested algorithm, and feed it with the header for a git object of the
given type and length.
If the length is provided as algorithm, the length is also
computed and returned.
The header for hashing a git object consists of:
"""
def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
self.state = {}
self.track_length = False
for name in hash_names:
if name == "length":
self.state["length"] = 0
self.track_length = True
else:
self.state[name] = _new_hash(name, length)
@classmethod
def from_state(cls, state, track_length):
ret = cls([])
ret.state = state
ret.track_length = track_length
@classmethod
def from_file(cls, fobj, hash_names=DEFAULT_ALGORITHMS, length=None):
ret = cls(length=length, hash_names=hash_names)
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
ret.update(chunk)
return ret
@classmethod
def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS):
length = os.path.getsize(path)
with open(path, "rb") as f:
ret = cls.from_file(f, hash_names=hash_names, length=length)
return ret
@classmethod
def from_data(cls, data, hash_names=DEFAULT_ALGORITHMS):
length = len(data)
fobj = BytesIO(data)
return cls.from_file(fobj, hash_names=hash_names, length=length)
def update(self, chunk):
for name, h in self.state.items():
if name == "length":
continue
h.update(chunk)
if self.track_length:
self.state["length"] += len(chunk)
def digest(self):
return {
name: h.digest() if name != "length" else h
for name, h in self.state.items()
}
def hexdigest(self):
return {
name: h.hexdigest() if name != "length" else h
for name, h in self.state.items()
}
def bytehexdigest(self):
return {
name: hash_to_bytehex(h.digest()) if name != "length" else h
for name, h in self.state.items()
}
def copy(self):
copied_state = {
name: h.copy() if name != "length" else h for name, h in self.state.items()
}
return self.from_state(copied_state, self.track_length)
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash."""
if algo in _blake2_hash_cache:
return _blake2_hash_cache[algo]()
lalgo = algo.lower()
if not lalgo.startswith("blake2"):
raise ValueError("Algorithm %s is not a blake2 hash" % algo)
blake_family = lalgo[:7]
digest_size = None
if lalgo[7:]:
try:
digest_size, remainder = divmod(int(lalgo[7:]), 8)
except ValueError:
raise ValueError("Unknown digest size for algo %s" % algo) from None
if remainder:
raise ValueError(
"Digest size for algorithm %s must be a multiple of 8" % algo
)
blake2 = getattr(hashlib, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
return _blake2_hash_cache[algo]()
def _new_hashlib_hash(algo):
"""Initialize a digest object from hashlib.
Handle the swh-specific names for the blake2-related algorithms
"""
if algo.startswith("blake2"):
return _new_blake2_hash(algo)
else:
return hashlib.new(algo)
def git_object_header(git_type: str, length: int) -> bytes:
"""Returns the header for a git object of the given type and length.
The header of a git object consists of:
- The type of the object (encoded in ASCII)
- One ASCII space (\x20)
- The length of the object (decimal encoded in ASCII)
- One NUL byte
Args:
base_algo: a hashlib-supported algorithm
base_algo (str from :const:`ALGORITHMS`): a hashlib-supported algorithm
git_type: the type of the git object (supposedly one of 'blob',
'commit', 'tag', 'tree')
length: the length of the git object you're encoding
......@@ -72,15 +218,26 @@ def _new_git_hash(base_algo, git_type, length):
Returns:
a hashutil.hash object
"""
git_object_types = {
"blob",
"tree",
"commit",
"tag",
"snapshot",
"raw_extrinsic_metadata",
"extid",
}
h = hashlib.new(base_algo)
git_header = '%s %d\0' % (git_type, length)
h.update(git_header.encode('ascii'))
if git_type not in git_object_types:
raise ValueError(
"Unexpected git object type %s, expected one of %s"
% (git_type, ", ".join(sorted(git_object_types)))
)
return h
return ("%s %d\0" % (git_type, length)).encode("ascii")
def _new_hash(algo, length=None):
def _new_hash(algo: str, length: Optional[int] = None):
"""Initialize a digest object (as returned by python's hashlib) for
the requested algorithm. See the constant ALGORITHMS for the list
of supported algorithms. If a git-specific hashing algorithm is
......@@ -90,7 +247,7 @@ def _new_hash(algo, length=None):
Args:
algo (str): a hashing algorithm (one of ALGORITHMS)
length (int): the length of the hashed payload (needed for
git-specific algorithms)
git-specific algorithms)
Returns:
a hashutil.hash object
......@@ -102,87 +259,22 @@ def _new_hash(algo, length=None):
"""
if algo not in ALGORITHMS:
raise ValueError(
'Unexpected hashing algorithm %s, expected one of %s' %
(algo, ', '.join(sorted(ALGORITHMS))))
"Unexpected hashing algorithm %s, expected one of %s"
% (algo, ", ".join(sorted(ALGORITHMS)))
)
if algo.endswith('_git'):
if algo.endswith("_git"):
if length is None:
raise ValueError('Missing length for git hashing algorithm')
raise ValueError("Missing length for git hashing algorithm")
base_algo = algo[:-4]
return _new_git_hash(base_algo, 'blob', length)
return hashlib.new(algo)
def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
"""Hash the contents of the given file object with the given algorithms.
Args:
fobj: a file-like object
length: the length of the contents of the file-like object (for the
git-specific algorithms)
algorithms: the hashing algorithms used
Returns: a dict mapping each algorithm to a bytes digest.
Raises:
ValueError if algorithms contains an unknown hash algorithm.
"""
hashes = {algo: _new_hash(algo, length) for algo in algorithms}
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
for hash in hashes.values():
hash.update(chunk)
if chunk_cb:
chunk_cb(chunk)
return {algo: hash.digest() for algo, hash in hashes.items()}
def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
"""Hash the contents of the file at the given path with the given
algorithms.
Args:
path: the path of the file to hash
algorithms: the hashing algorithms used
chunk_cb: a callback
Returns: a dict mapping each algorithm to a bytes digest.
Raises:
ValueError if algorithms contains an unknown hash algorithm.
OSError on file access error
"""
length = os.path.getsize(path)
with open(path, 'rb') as fobj:
hash = hash_file(fobj, length, algorithms, chunk_cb)
hash['length'] = length
return hash
def hash_data(data, algorithms=DEFAULT_ALGORITHMS):
"""Hash the given binary blob with the given algorithms.
Args:
data: a bytes object
algorithms: the hashing algorithms used
Returns: a dict mapping each algorithm to a bytes digest
h = _new_hashlib_hash(base_algo)
h.update(git_object_header("blob", length))
return h
Raises:
TypeError if data does not support the buffer interface.
ValueError if algorithms contains an unknown hash algorithm.
"""
fobj = BytesIO(data)
return hash_file(fobj, len(data), algorithms)
return _new_hashlib_hash(algo)
def hash_git_data(data, git_type, base_algo='sha1'):
def hash_git_data(data, git_type, base_algo="sha1"):
"""Hash the given data as a git object of type git_type.
Args:
......@@ -195,42 +287,67 @@ def hash_git_data(data, git_type, base_algo='sha1'):
Raises:
ValueError if the git_type is unexpected.
"""
git_object_types = {'blob', 'tree', 'commit', 'tag'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
(git_type, ', '.join(sorted(git_object_types))))
h = _new_git_hash(base_algo, git_type, len(data))
h = _new_hashlib_hash(base_algo)
h.update(git_object_header(git_type, len(data)))
h.update(data)
return h.digest()
@functools.lru_cache()
def hash_to_hex(hash):
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form"""
def hash_to_hex(hash: Union[str, bytes]) -> str:
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form
Args:
hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
the hexadecimal form of the hash
Returns:
str: the hexadecimal form of the hash
"""
if isinstance(hash, str):
return hash
return binascii.hexlify(hash).decode('ascii')
return binascii.hexlify(hash).decode("ascii")
@functools.lru_cache()
def hash_to_bytehex(hash):
"""Converts a hash to its hexadecimal bytes representation"""
def hash_to_bytehex(hash: bytes) -> bytes:
"""Converts a hash to its hexadecimal bytes representation
Args:
hash (bytes): a :class:`bytes` hash
Returns:
bytes: the hexadecimal form of the hash, as :class:`bytes`
"""
return binascii.hexlify(hash)
@functools.lru_cache()
def hash_to_bytes(hash):
"""Converts a hash (in hex or bytes form) to its raw bytes form"""
def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
"""Converts a hash (in hex or bytes form) to its raw bytes form
Args:
hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
the hexadecimal form of the hash
Returns:
bytes: the :class:`bytes` form of the hash
"""
if isinstance(hash, bytes):
return hash
return bytes.fromhex(hash)
@functools.lru_cache()
def bytehex_to_hash(hex):
"""Converts a hexadecimal bytes representation of a hash to that hash"""
def bytehex_to_hash(hex: bytes) -> bytes:
"""Converts a hexadecimal bytes representation of a hash to that hash
Args:
hash (bytes): a :class:`bytes` containing the hexadecimal form of the
hash encoded in ascii
Returns:
bytes: the :class:`bytes` form of the hash
"""
return hash_to_bytes(hex.decode())
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Marker file for PEP 561.
This diff is collapsed.
File added
File added
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.