Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-model
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Renaud Boyer
swh-model
Commits
59742ac0
Commit
59742ac0
authored
9 years ago
by
Nicolas Dandrimont
Browse files
Options
Downloads
Patches
Plain Diff
swh.model.hashutil: Add hashing utilities to swh.model
parent
76eb3640
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
swh/model/hashutil.py
+147
-0
147 additions, 0 deletions
swh/model/hashutil.py
swh/model/tests/test_hashutil.py
+75
-0
75 additions, 0 deletions
swh/model/tests/test_hashutil.py
with
222 additions
and
0 deletions
swh/model/hashutil.py
0 → 100644
+
147
−
0
View file @
59742ac0
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
hashlib
from
io
import
BytesIO
# Supported hashing algorithms. Names ending in "_git" are hashed as git
# blobs: the payload is prefixed with a git object header before being fed to
# the base algorithm (the name without the "_git" suffix).
ALGORITHMS = {'sha1', 'sha256', 'sha1_git'}

# Size of the chunks read from file objects while hashing.
# should be a multiple of 64 (sha1/sha256's block size)
# FWIW coreutils' sha1sum uses 32768
HASH_BLOCK_SIZE = 32768
def _new_git_hash(base_algo, git_type, length):
    """Create a hashlib digest object pre-fed with a git object header.

    The header for hashing a git object consists of:
     - The type of the object (encoded in ASCII)
     - One ASCII space (0x20)
     - The length of the object (decimal encoded in ASCII)
     - One NUL byte

    Args:
        base_algo: a hashlib-supported algorithm name
        git_type: the type of the git object (supposedly one of 'blob',
                  'commit', 'tag', 'tree'); it is not validated here
        length: the length of the git object you're encoding

    Returns:
        a hashlib hash object that has already been updated with the header;
        the caller still has to feed it the object's payload

    Raises:
        ValueError: if base_algo is not supported by hashlib, or if git_type
                    cannot be encoded as ASCII (UnicodeEncodeError)
    """
    digest = hashlib.new(base_algo)

    # "<type> <decimal length>\0", exactly as git prefixes its objects
    git_header = ('%s %d\0' % (git_type, length)).encode('ascii')
    digest.update(git_header)

    return digest
def _new_hash(algo, length=None):
    """Build the digest object used to compute the `algo` checksum.

    Plain algorithms map directly onto hashlib. Git-specific algorithms
    (those whose name ends in "_git", e.g. "sha1_git") use the base hashlib
    algorithm pre-fed with the header of a git blob of the given length, which
    is why length is mandatory for them.

    Args:
        algo: a hashing algorithm (one of ALGORITHMS)
        length: the length of the hashed payload (needed for git-specific
                algorithms)

    Returns:
        a hashlib hash object

    Raises:
        ValueError if algo is unknown, or length is missing for a git-specific
        hash.
    """
    if algo not in ALGORITHMS:
        supported = ', '.join(sorted(ALGORITHMS))
        raise ValueError('Unexpected hashing algorithm %s, '
                         'expected one of %s' % (algo, supported))

    git_suffix = '_git'
    if not algo.endswith(git_suffix):
        return hashlib.new(algo)

    if length is None:
        raise ValueError('Missing length for git hashing algorithm')

    # strip the suffix to recover the underlying hashlib algorithm name
    return _new_git_hash(algo[:-len(git_suffix)], 'blob', length)
def hash_file(fobj, length=None, algorithms=ALGORITHMS):
    """Compute several checksums of a file object in a single pass.

    The file object is read in chunks of HASH_BLOCK_SIZE until it returns an
    empty chunk; every chunk is fed to one digest object per algorithm.

    Args:
        fobj: a file-like object
        length: the length of the contents of the file-like object (for the
                git-specific algorithms)
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        ValueError if algorithms contains an unknown hash algorithm, or if
        length is missing for a git-specific algorithm.
    """
    # all digest objects are created up front, so an invalid request fails
    # before anything is read from fobj
    digests = {}
    for name in algorithms:
        digests[name] = _new_hash(name, length)

    block = fobj.read(HASH_BLOCK_SIZE)
    while block:
        for digest in digests.values():
            digest.update(block)
        block = fobj.read(HASH_BLOCK_SIZE)

    return {name: digest.hexdigest() for name, digest in digests.items()}
def hash_data(data, algorithms=ALGORITHMS):
    """Compute several checksums of an in-memory binary blob.

    The data is wrapped in a BytesIO and handed to hash_file, with its length
    passed along for the git-specific algorithms.

    Args:
        data: a bytes object
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        TypeError if data does not support the buffer interface.
        ValueError if algorithms contains an unknown hash algorithm.
    """
    return hash_file(BytesIO(data), length=len(data), algorithms=algorithms)
def hash_git_data(data, git_type, base_algo='sha1'):
    """Hash the given data as a git object of type git_type.

    Args:
        data: a bytes object
        git_type: the git object type (one of 'blob', 'commit', 'tag',
                  'tree')
        base_algo: the base hashing algorithm used (default: sha1)

    Returns: the hexadecimal digest of the git object, as a single string
        (not a dict, unlike hash_data and hash_file)

    Raises:
        ValueError if the git_type is unexpected.
    """
    git_object_types = {'blob', 'tree', 'commit', 'tag'}

    if git_type not in git_object_types:
        raise ValueError('Unexpected git object type %s, expected one of %s' %
                         (git_type, ', '.join(sorted(git_object_types))))

    # the digest comes back already fed with the "<type> <length>\0" header
    h = _new_git_hash(base_algo, git_type, len(data))
    h.update(data)

    return h.hexdigest()
This diff is collapsed.
Click to expand it.
swh/model/tests/test_hashutil.py
0 → 100644
+
75
−
0
View file @
59742ac0
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
io
import
unittest
from
nose.tools
import
istest
from
swh.model
import
hashutil
class Hashutil(unittest.TestCase):
    """Check swh.model.hashutil against known digests of a small payload."""

    def setUp(self):
        # Fixture payload: the bytes "1984" followed by a newline.
        self.data = b'1984\n'

        # Expected output of hash_data / hash_file with the default algorithms.
        self.hex_checksums = dict(
            sha1='62be35bf00ff0c624f4a621e2ea5595a049e0731',
            sha1_git='568aaf43d83b2c3df8067f3bedbb97d83260be6d',
            sha256=('26602113b4b9afd9d55466b08580d3c2'
                    '4a9b50ee5b5866c0d91fab0e65907311'),
        )

        # Expected output of hash_git_data for each git object type; the blob
        # digest is by construction the same as the sha1_git checksum.
        self.git_checksums = dict(
            blob=self.hex_checksums['sha1_git'],
            tree='5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
            commit='79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
            tag='d6bf62466f287b4d986c545890716ce058bddf67',
        )

    @istest
    def hash_data(self):
        """hash_data with default algorithms yields all known checksums."""
        self.assertEqual(hashutil.hash_data(self.data), self.hex_checksums)

    @istest
    def hash_data_unknown_hash(self):
        """An unsupported algorithm name is rejected and named in the error."""
        with self.assertRaises(ValueError) as raised:
            hashutil.hash_data(self.data, ['unknown-hash'])

        message = raised.exception.args[0]
        self.assertIn('Unexpected hashing algorithm', message)
        self.assertIn('unknown-hash', message)

    @istest
    def hash_git_data(self):
        """hash_git_data matches the known digest for every git type."""
        computed = {}
        for object_type in self.git_checksums:
            computed[object_type] = hashutil.hash_git_data(self.data,
                                                           object_type)

        self.assertEqual(computed, self.git_checksums)

    @istest
    def hash_git_data_unknown_git_type(self):
        """An unsupported git object type is rejected and named in the error."""
        with self.assertRaises(ValueError) as raised:
            hashutil.hash_git_data(self.data, 'unknown-git-type')

        message = raised.exception.args[0]
        self.assertIn('Unexpected git object type', message)
        self.assertIn('unknown-git-type', message)

    @istest
    def hash_file(self):
        """hash_file on a file object with an explicit length yields all
        known checksums."""
        stream = io.BytesIO(self.data)
        computed = hashutil.hash_file(stream, length=len(self.data))

        self.assertEqual(computed, self.hex_checksums)

    @istest
    def hash_file_missing_length(self):
        """A git-specific algorithm without a length is rejected."""
        stream = io.BytesIO(self.data)

        with self.assertRaises(ValueError) as raised:
            hashutil.hash_file(stream, algorithms=['sha1_git'])

        self.assertIn('Missing length', raised.exception.args[0])
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment