Newer
Older
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from abc import ABCMeta, abstractmethod
from enum import Enum
from typing import List, Optional, Dict, Union
import attr
import dateutil.parser
import iso8601
from .identifiers import (
normalize_timestamp, directory_identifier, revision_identifier,
release_identifier, snapshot_identifier
)
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash

vlorentz
committed
class MissingData(Exception):
    """Signals that `Content.with_data` has no way of fetching the data.

    It is not meant for the case where fetching the data was attempted
    and failed.
    """
# Type alias used to annotate the sha1_git identifiers (ids, targets,
# parents, ...) carried by the model classes below.
# TODO: Limit this to 20 bytes
Sha1Git = bytes
class BaseModel:
    """Common ancestor of the SWH model classes.

    Offers conversion to and from plain Python dictionaries, so that
    objects can be stored in JSON/msgpack-like formats."""

    def to_dict(self):
        """Wrapper of `attr.asdict`; subclasses may override it when some
        of their fields need special handling."""
        def convert(item):
            # Nested models serialize themselves, enums collapse to their
            # value, and dicts/lists are walked recursively; anything else
            # is returned untouched.
            if isinstance(item, BaseModel):
                return item.to_dict()
            if isinstance(item, Enum):
                return item.value
            if isinstance(item, dict):
                return {key: convert(val) for key, val in item.items()}
            if isinstance(item, list):
                return [convert(elem) for elem in item]
            return item

        # recurse=False: nested objects are converted by convert() above
        # rather than by attr.asdict itself.
        shallow = attr.asdict(self, recurse=False)
        return convert(shallow)

    @classmethod
    def from_dict(cls, d):
        """Builds an instance of `cls` from a dictionary, passing its
        items as keyword arguments to the constructor. Subclasses override
        this to rebuild nested SWH objects recursively."""
        return cls(**d)
class HashableObject(metaclass=ABCMeta):
    """Mixin filling in the object identifier hash automatically when
    the associated model is instantiated."""

    @staticmethod
    @abstractmethod
    def compute_hash(object_dict):
        """Computes the object hash from its dict representation.

        Derived model classes must implement this."""
        pass

    def __attrs_post_init__(self):
        # An id that is already set (truthy) is kept as is.
        if self.id:
            return
        digest = self.compute_hash(self.to_dict())
        # object.__setattr__ bypasses the class' own __setattr__
        # (presumably required because the attrs classes are frozen).
        object.__setattr__(self, 'id', hash_to_bytes(digest))
@attr.s(frozen=True)
class Person(BaseModel):
    """Represents the author/committer of a revision or release."""
    fullname = attr.ib(type=bytes)
    name = attr.ib(type=Optional[bytes])
    email = attr.ib(type=Optional[bytes])

    @classmethod
    def from_fullname(cls, fullname: bytes):
        """Returns a Person object, by guessing the name and email from the
        fullname, in the `name <email>` format.

        The name is whatever precedes the first `<`, stripped of
        surrounding whitespace; the email is whatever lies between that
        `<` and the last `>` (or the end of the string if there is no
        `>`). Empty parts are stored as None.

        The fullname is left unchanged.

        Raises:
            TypeError: if `fullname` is None.
        """
        if fullname is None:
            raise TypeError('fullname is None.')

        name: Optional[bytes]
        email: Optional[bytes]

        try:
            open_bracket = fullname.index(b'<')
        except ValueError:
            # No email part at all: the whole fullname is the name.
            name = fullname
            email = None
        else:
            raw_name = fullname[:open_bracket]
            raw_email = fullname[open_bracket+1:]

            # An empty result is turned into None below.
            name = raw_name.strip()

            try:
                close_bracket = raw_email.rindex(b'>')
            except ValueError:
                # Unterminated `<...`: keep everything after the bracket.
                email = raw_email
            else:
                email = raw_email[:close_bracket]

        return Person(
            name=name or None,
            email=email or None,
            fullname=fullname,
        )
@attr.s(frozen=True)
class Timestamp(BaseModel):
    """Represents a naive timestamp from a VCS."""
    seconds = attr.ib(type=int)
    microseconds = attr.ib(type=int)

    @seconds.validator
    def check_seconds(self, attribute, value):
        """Check that seconds fit in a 64-bits signed integer.

        Raises:
            ValueError: if `value` is outside [-2**63, 2**63[.
        """
        if not (-2**63 <= value < 2**63):
            raise ValueError('Seconds must be a signed 64-bits integer.')

    @microseconds.validator
    def check_microseconds(self, attribute, value):
        """Checks that microseconds are positive and < 1000000.

        Raises:
            ValueError: if `value` is outside [0, 1000000[.
        """
        if not (0 <= value < 10**6):
            raise ValueError('Microseconds must be in [0, 1000000[.')
@attr.s(frozen=True)
class TimestampWithTimezone(BaseModel):
    """Represents a TZ-aware timestamp from a VCS."""
    timestamp = attr.ib(type=Timestamp)
    offset = attr.ib(type=int)
    negative_utc = attr.ib(type=bool)

    @offset.validator
    def check_offset(self, attribute, value):
        """Checks the offset is a 16-bits signed integer (in theory, it
        should always be between -14 and +14 hours).

        Raises:
            ValueError: if `value` is outside [-2**15, 2**15[.
        """
        if not (-2**15 <= value < 2**15):
            # max 14 hours offset in theory, but you never know what
            # you'll find in the wild...
            raise ValueError('offset too large: %d minutes' % value)

    @classmethod
    def from_dict(cls, obj: Union[Dict, datetime.datetime, int]):
        """Builds a TimestampWithTimezone from any of the formats
        accepted by :func:`swh.model.normalize_timestamp`."""
        # TODO: this accept way more types than just dicts; find a better
        # name
        # The normalized form is read through its 'timestamp', 'offset'
        # and 'negative_utc' keys.
        d = normalize_timestamp(obj)
        return cls(
            timestamp=Timestamp.from_dict(d['timestamp']),
            offset=d['offset'],
            negative_utc=d['negative_utc'])

    @classmethod
    def from_datetime(cls, dt: datetime.datetime):
        """Builds a TimestampWithTimezone from a datetime object, by
        delegating to `from_dict`."""
        return cls.from_dict(dt)

    @classmethod
    def from_iso8601(cls, s):
        """Builds a TimestampWithTimezone from an ISO8601-formatted string.
        """
        dt = iso8601.parse_date(s)
        tstz = cls.from_datetime(dt)
        # A timezone reported as '-00:00' is recorded through the
        # negative_utc flag.
        if dt.tzname() == '-00:00':
            tstz = attr.evolve(tstz, negative_utc=True)
        return tstz
@attr.s(frozen=True)
class Origin(BaseModel):
    """Represents a software source: a VCS and an URL."""
    url = attr.ib(type=str)

    def to_dict(self):
        """Serializes the origin, making sure the result carries no
        'type' key."""
        r = super().to_dict()
        # NOTE(review): no 'type' attribute is declared on this class, so
        # this pop looks like a no-op kept for compatibility -- confirm.
        r.pop('type', None)
        return r
@attr.s(frozen=True)
class OriginVisit(BaseModel):
    """Represents a visit of an origin at a given point in time, by a
    SWH loader."""
    origin = attr.ib(type=str)
    date = attr.ib(type=datetime.datetime)
    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['ongoing', 'full', 'partial']))
    type = attr.ib(type=str)
    snapshot = attr.ib(type=Optional[Sha1Git])
    metadata = attr.ib(type=Optional[Dict[str, object]],
                       default=None)
    # Should not be set before calling 'origin_visit_add()'.
    visit = attr.ib(type=Optional[int],
                    default=None)

    def to_dict(self):
        """Serializes the visit, omitting the visit id if it is `None`."""
        ov = super().to_dict()
        if ov['visit'] is None:
            del ov['visit']
        return ov

    @classmethod
    def from_dict(cls, d):
        """Parses the date from a string, and accepts missing visit ids.

        The date may also be given directly as a `datetime.datetime`, in
        which case it is used as is. The input dictionary is not modified.
        """
        d = d.copy()
        date = d.pop('date')
        if not isinstance(date, datetime.datetime):
            date = dateutil.parser.parse(date)
        # Remaining keys map directly to attributes; a missing 'visit'
        # falls back to the attribute default (None).
        return cls(date=date, **d)
class TargetType(Enum):
    """Kind of object a snapshot branch points to; in most cases a
    revision or an alias."""
    CONTENT = 'content'
    DIRECTORY = 'directory'
    REVISION = 'revision'
    RELEASE = 'release'
    SNAPSHOT = 'snapshot'
    ALIAS = 'alias'
class ObjectType(Enum):
    """Kind of object a release points to; in most cases a revision."""
    CONTENT = 'content'
    DIRECTORY = 'directory'
    REVISION = 'revision'
    RELEASE = 'release'
    SNAPSHOT = 'snapshot'
@attr.s(frozen=True)
class SnapshotBranch(BaseModel):
    """Represents one of the branches of a snapshot."""
    target = attr.ib(type=bytes)
    target_type = attr.ib(type=TargetType)

    @target.validator
    def check_target(self, attribute, value):
        """Unless the branch is an alias (or has no target), checks the
        target has the length of a sha1_git (20 bytes).

        Raises:
            ValueError: if a non-alias target is not 20 bytes long.
        """
        if self.target_type != TargetType.ALIAS and self.target is not None:
            if len(value) != 20:
                raise ValueError('Wrong length for bytes identifier: %d' %
                                 len(value))

    @classmethod
    def from_dict(cls, d):
        """Builds a SnapshotBranch from a dictionary, converting the
        'target_type' value to a `TargetType`."""
        return cls(
            target=d['target'],
            target_type=TargetType(d['target_type']))
@attr.s(frozen=True)
class Snapshot(BaseModel, HashableObject):
    """Represents the full state of an origin at a given point in time."""
    branches = attr.ib(type=Dict[bytes, Optional[SnapshotBranch]])
    id = attr.ib(type=Sha1Git, default=b'')

    @staticmethod
    def compute_hash(object_dict):
        """Returns the snapshot identifier computed by
        `snapshot_identifier` from the dict representation."""
        return snapshot_identifier(object_dict)

    @classmethod
    def from_dict(cls, d):
        """Builds a Snapshot from a dictionary, turning each non-empty
        branch into a `SnapshotBranch` (falsy branches become None).
        The input dictionary is not modified."""
        d = d.copy()
        return cls(
            branches={
                name: SnapshotBranch.from_dict(branch) if branch else None
                for (name, branch) in d.pop('branches').items()
            },
            **d)
@attr.s(frozen=True)
class Release(BaseModel, HashableObject):
    """Represents a release; its `target_type` is an `ObjectType`."""
    # NOTE(review): no `target` attribute is visible in this view although
    # `target_type` is; if the input of from_dict() carries a 'target' key
    # it is forwarded to the constructor as is -- confirm against the
    # complete file.
    name = attr.ib(type=bytes)
    message = attr.ib(type=bytes)
    target_type = attr.ib(type=ObjectType)
    synthetic = attr.ib(type=bool)
    author = attr.ib(type=Optional[Person],
                     default=None)
    date = attr.ib(type=Optional[TimestampWithTimezone],
                   default=None)
    metadata = attr.ib(type=Optional[Dict[str, object]],
                       default=None)
    id = attr.ib(type=Sha1Git, default=b'')

    @staticmethod
    def compute_hash(object_dict):
        """Returns the release identifier computed by
        `release_identifier` from the dict representation."""
        return release_identifier(object_dict)

    @author.validator
    def check_author(self, attribute, value):
        """If the author is `None`, checks the date is `None` too.

        Raises:
            ValueError: if a date is set while the author is None.
        """
        if self.author is None and self.date is not None:
            raise ValueError('release date must be None if author is None.')

    def to_dict(self):
        """Serializes the release, omitting the metadata if it is
        `None`."""
        rel = super().to_dict()
        if rel['metadata'] is None:
            del rel['metadata']
        return rel

    @classmethod
    def from_dict(cls, d):
        """Builds a Release from a dictionary, rebuilding the nested
        author and date objects when present and converting 'target_type'
        to an `ObjectType`. The input dictionary is not modified."""
        d = d.copy()
        if d.get('author'):
            d['author'] = Person.from_dict(d['author'])
        if d.get('date'):
            d['date'] = TimestampWithTimezone.from_dict(d['date'])
        return cls(
            target_type=ObjectType(d.pop('target_type')),
            **d)
class RevisionType(Enum):
    """Values accepted for the `type` attribute of a revision."""
    GIT = 'git'
    TAR = 'tar'
    DSC = 'dsc'
    SUBVERSION = 'svn'
    MERCURIAL = 'hg'
@attr.s(frozen=True)
class Revision(BaseModel, HashableObject):
    """Represents a revision: a message, its author/committer with their
    dates, a root directory and a list of parent revisions."""
    message = attr.ib(type=bytes)
    author = attr.ib(type=Person)
    committer = attr.ib(type=Person)
    date = attr.ib(type=Optional[TimestampWithTimezone])
    committer_date = attr.ib(type=Optional[TimestampWithTimezone])
    type = attr.ib(type=RevisionType)
    directory = attr.ib(type=Sha1Git)
    synthetic = attr.ib(type=bool)
    metadata = attr.ib(type=Optional[Dict[str, object]],
                       default=None)
    parents = attr.ib(type=List[Sha1Git],
                      default=attr.Factory(list))
    id = attr.ib(type=Sha1Git, default=b'')

    @staticmethod
    def compute_hash(object_dict):
        """Returns the revision identifier computed by
        `revision_identifier` from the dict representation."""
        return revision_identifier(object_dict)

    @classmethod
    def from_dict(cls, d):
        """Builds a Revision from a dictionary, rebuilding the nested
        author, committer and date objects and converting 'type' to a
        `RevisionType`. The input dictionary is not modified."""
        # Work on a copy: the pops below must not alter the caller's dict.
        d = d.copy()
        date = d.pop('date')
        if date:
            date = TimestampWithTimezone.from_dict(date)

        committer_date = d.pop('committer_date')
        if committer_date:
            committer_date = TimestampWithTimezone.from_dict(
                committer_date)

        return cls(
            author=Person.from_dict(d.pop('author')),
            committer=Person.from_dict(d.pop('committer')),
            date=date,
            committer_date=committer_date,
            type=RevisionType(d.pop('type')),
            **d)
@attr.s(frozen=True)
class DirectoryEntry(BaseModel):
    """Represents one entry of a directory; its `type` is one of 'file',
    'dir' or 'rev'."""
    name = attr.ib(type=bytes)
    type = attr.ib(type=str,
                   validator=attr.validators.in_(['file', 'dir', 'rev']))
    target = attr.ib(type=Sha1Git)
    # Usually one of the values of `swh.model.from_disk.DentryPerms`.
    perms = attr.ib(type=int)
@attr.s(frozen=True)
class Directory(BaseModel, HashableObject):
    """Represents a directory as a list of `DirectoryEntry` objects."""
    entries = attr.ib(type=List[DirectoryEntry])
    id = attr.ib(type=Sha1Git, default=b'')

    @staticmethod
    def compute_hash(object_dict):
        """Returns the directory identifier computed by
        `directory_identifier` from the dict representation."""
        return directory_identifier(object_dict)

    @classmethod
    def from_dict(cls, d):
        """Builds a Directory from a dictionary, turning each item of
        'entries' into a `DirectoryEntry`. The input dictionary is not
        modified."""
        d = d.copy()
        return cls(
            entries=[DirectoryEntry.from_dict(entry)
                     for entry in d.pop('entries')],
            **d)
class BaseContent(BaseModel):
    """Behavior shared by `Content` and `SkippedContent`: hashing helper,
    serialization and access to the hashes."""
    # NOTE(review): this class carries no attr.s decorator in this view;
    # the concrete subclasses are decorated and redeclare `status`.
    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['visible', 'hidden', 'absent']))

    @staticmethod
    def _hash_data(data: bytes):
        """Hashes `data` and returns most of the fields of a content
        object: the digests, plus 'data' and 'length'."""
        fields = MultiHash.from_data(data).digest()
        fields['data'] = data
        fields['length'] = len(data)
        return fields

    def to_dict(self):
        """Serializes the content, leaving out 'ctime' when it is None."""
        serialized = super().to_dict()
        if serialized['ctime'] is None:
            del serialized['ctime']
        return serialized

    @classmethod
    def from_dict(cls, d, use_subclass=True):
        """Builds a content object from a dictionary.

        With `use_subclass` set, the class is picked from the status:
        `SkippedContent` for 'absent', `Content` otherwise. Without it,
        `cls` itself is instantiated."""
        if not use_subclass:
            return super().from_dict(d)
        # Chooses a subclass to instantiate instead.
        chosen = SkippedContent if d['status'] == 'absent' else Content
        return chosen.from_dict(d)

    def get_hash(self, hash_name):
        """Returns the value of the hash called `hash_name`; raises
        ValueError when it is not one of DEFAULT_ALGORITHMS."""
        if hash_name in DEFAULT_ALGORITHMS:
            return getattr(self, hash_name)
        raise ValueError('{} is not a valid hash name.'.format(hash_name))

    def hashes(self) -> Dict[str, bytes]:
        """Returns a dictionary {hash_name: hash_value}"""
        result = {}
        for algo in DEFAULT_ALGORITHMS:
            result[algo] = getattr(self, algo)
        return result
@attr.s(frozen=True)
class Content(BaseContent):
    """Represents a content whose status is 'visible' or 'hidden',
    identified by its hashes and length, optionally with its data."""
    sha1 = attr.ib(type=bytes)
    sha1_git = attr.ib(type=Sha1Git)
    sha256 = attr.ib(type=bytes)
    blake2s256 = attr.ib(type=bytes)

    length = attr.ib(type=int)

    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['visible', 'hidden']))

    data = attr.ib(type=Optional[bytes], default=None)

    ctime = attr.ib(type=Optional[datetime.datetime],
                    default=None)

    @length.validator
    def check_length(self, attribute, value):
        """Checks the length is positive.

        Raises:
            ValueError: if `value` is negative.
        """
        # Only reject negative lengths; raising unconditionally would make
        # every Content impossible to build.
        if value < 0:
            raise ValueError('Length must be positive.')

    def to_dict(self):
        """Serializes the content, omitting the data if it is `None`."""
        content = super().to_dict()
        if content['data'] is None:
            del content['data']
        return content

    @classmethod
    def from_data(cls, data, status='visible') -> 'Content':
        """Generate a Content from a given `data` byte string.

        This populates the Content with the hashes and length for the data
        passed as argument, as well as the data itself.
        """
        d = cls._hash_data(data)
        d['status'] = status
        return cls(**d)

    @classmethod
    def from_dict(cls, d):
        """Builds a Content from a dictionary, without dispatching on the
        status."""
        return super().from_dict(d, use_subclass=False)

    def with_data(self) -> 'Content':
        """Loads the `data` attribute; meaning that it is guaranteed not to
        be None after this call.

        This call is almost a no-op, but subclasses may overload this method
        to lazy-load data (eg. from disk or objstorage).

        Raises:
            MissingData: if the data is None.
        """
        if self.data is None:
            raise MissingData('Content data is None.')
        return self
@attr.s(frozen=True)
class SkippedContent(BaseContent):
    """Represents a content whose status is 'absent': hashes and length
    may be unknown, and a reason must be given."""
    sha1 = attr.ib(type=Optional[bytes])
    sha1_git = attr.ib(type=Optional[Sha1Git])
    sha256 = attr.ib(type=Optional[bytes])
    blake2s256 = attr.ib(type=Optional[bytes])

    length = attr.ib(type=Optional[int])

    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['absent']))
    reason = attr.ib(type=Optional[str],
                     default=None)

    origin = attr.ib(type=Optional[Origin],
                     default=None)

    ctime = attr.ib(type=Optional[datetime.datetime],
                    default=None)

    @reason.validator
    def check_reason(self, attribute, value):
        """Checks a reason is provided (the status of a SkippedContent is
        always 'absent').

        Raises:
            ValueError: if `value` is None.
        """
        if value is None:
            raise ValueError('Must provide a reason if content is absent.')

    @length.validator
    def check_length(self, attribute, value):
        """Checks the length is positive or -1.

        A length of None is accepted, as the attribute is optional.

        Raises:
            ValueError: if `value` is lower than -1.
        """
        # Only reject lengths below -1; raising unconditionally would make
        # every SkippedContent impossible to build.
        if value is not None and value < -1:
            raise ValueError('Length must be positive or -1.')

    def to_dict(self):
        """Serializes the content, omitting the origin if it is `None`."""
        content = super().to_dict()
        if content['origin'] is None:
            del content['origin']
        return content

    @classmethod
    def from_data(cls, data, reason: str) -> 'SkippedContent':
        """Generate a SkippedContent from a given `data` byte string.

        This populates the SkippedContent with the hashes and length for the
        data passed as argument.

        You can use `attr.evolve` on such a generated content to nullify some
        of its attributes, e.g. for tests.
        """
        d = cls._hash_data(data)
        # A skipped content does not carry the data itself.
        del d['data']
        d['status'] = 'absent'
        d['reason'] = reason
        return cls(**d)

    @classmethod
    def from_dict(cls, d):
        """Builds a SkippedContent from a dictionary, without dispatching
        on the status. The input dictionary is not modified.

        Raises:
            ValueError: if the dictionary carries a non-None 'data' value.
        """
        # Keep a reference to the untouched input for the error message.
        d2 = d
        d = d.copy()
        if d.pop('data', None) is not None:
            raise ValueError('SkippedContent has no "data" attribute %r' % d2)
        return super().from_dict(d, use_subclass=False)