class MissingData(Exception):
    """Signals that `Content.with_data` has no way of fetching the data.

    It is not meant for the case where fetching the data was attempted
    and failed."""
@attr.s(frozen=True)
class Person(BaseModel):
    """Represents the author/committer of a revision or release."""
    fullname = attr.ib(type=bytes)
    name = attr.ib(type=Optional[bytes])
    email = attr.ib(type=Optional[bytes])

    @classmethod
    def from_fullname(cls, fullname: bytes) -> 'Person':
        """Returns a Person object, by guessing the name and email from the
        fullname, in the `name <email>` format.

        Everything before the first `<` is the name, stripped of its
        surrounding whitespace. Everything after that `<`, up to the last
        `>` when there is one, is the email. When the fullname contains no
        `<` at all, it is used as-is for the name and the email is None.
        Empty names and emails are stored as None.

        The fullname is left unchanged.

        Raises:
            TypeError: if `fullname` is None.
        """
        if fullname is None:
            raise TypeError('fullname is None.')

        name: Optional[bytes]
        email: Optional[bytes]

        raw_name, open_bracket, raw_email = fullname.partition(b'<')
        if not open_bracket:
            # No email part at all: the whole fullname is the name.
            name = fullname
            email = None
        else:
            name = raw_name.strip()
            close_bracket = raw_email.rfind(b'>')
            if close_bracket < 0:
                # Unterminated '<...': keep everything after the bracket.
                email = raw_email
            else:
                email = raw_email[:close_bracket]

        # Use `cls` rather than `Person` so that subclasses calling this
        # alternate constructor get an instance of their own type.
        return cls(
            name=name or None,
            email=email or None,
            fullname=fullname,
        )
@attr.s(frozen=True)
class OriginVisit(BaseModel):
    """A visit of an origin made by a SWH loader at a given point in
    time."""
    origin = attr.ib(type=str)
    date = attr.ib(type=datetime.datetime)
    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['ongoing', 'full', 'partial']))
    type = attr.ib(type=str)
    snapshot = attr.ib(type=Optional[Sha1Git])
    metadata = attr.ib(type=Optional[Dict[str, object]], default=None)
    # Should not be set before calling 'origin_visit_add()'.
    visit = attr.ib(type=Optional[int], default=None)

    def to_dict(self):
        """Returns the dict form of the visit, leaving out the visit id
        when it is `None`."""
        serialized = super().to_dict()
        if serialized['visit'] is None:
            serialized.pop('visit')
        return serialized

    @classmethod
    def from_dict(cls, d):
        """Builds a visit from a dict whose 'date' is either a datetime
        object or a string to be parsed; the visit id may be missing."""
        fields = d.copy()
        raw_date = fields.pop('date')
        if isinstance(raw_date, datetime.datetime):
            visit_date = raw_date
        else:
            visit_date = dateutil.parser.parse(raw_date)
        return cls(date=visit_date, **fields)
@attr.s(frozen=True)
class SnapshotBranch(BaseModel):
    """A single branch of a snapshot: a target and the type of object
    that target designates."""
    target = attr.ib(type=bytes)
    target_type = attr.ib(type=TargetType)

    @target.validator
    def check_target(self, attribute, value):
        """Unless the branch is an alias or has no target, checks the
        target is exactly 20 bytes long."""
        if self.target_type == TargetType.ALIAS or self.target is None:
            # Aliases and missing targets are not length-checked.
            return
        if len(value) == 20:
            return
        raise ValueError(
            'Wrong length for bytes identifier: %d' % len(value))

    @classmethod
    def from_dict(cls, d):
        """Builds a branch from a dict, converting 'target_type' to a
        `TargetType` member."""
        branch_type = TargetType(d['target_type'])
        return cls(target=d['target'], target_type=branch_type)
@attr.s(frozen=True)
class Revision(BaseModel, HashableObject):
    """Model of a revision; its id is derived through
    `revision_identifier`."""
    message = attr.ib(type=bytes)
    author = attr.ib(type=Person)
    committer = attr.ib(type=Person)
    date = attr.ib(type=Optional[TimestampWithTimezone])
    committer_date = attr.ib(type=Optional[TimestampWithTimezone])
    type = attr.ib(type=RevisionType)
    directory = attr.ib(type=Sha1Git)
    synthetic = attr.ib(type=bool)
    metadata = attr.ib(type=Optional[Dict[str, object]], default=None)
    parents = attr.ib(type=List[Sha1Git], default=attr.Factory(list))
    id = attr.ib(type=Sha1Git, default=b'')

    @staticmethod
    def compute_hash(object_dict):
        """Returns the revision identifier of the given dict."""
        return revision_identifier(object_dict)

    @classmethod
    def from_dict(cls, d):
        """Builds a revision from a dict, converting the nested author,
        committer, dates and type to their model classes. Falsy dates are
        passed through untouched."""
        remaining = d.copy()

        raw_date = remaining.pop('date')
        date = (TimestampWithTimezone.from_dict(raw_date)
                if raw_date else raw_date)

        raw_committer_date = remaining.pop('committer_date')
        committer_date = (
            TimestampWithTimezone.from_dict(raw_committer_date)
            if raw_committer_date else raw_committer_date)

        author = Person.from_dict(remaining.pop('author'))
        committer = Person.from_dict(remaining.pop('committer'))
        revision_type = RevisionType(remaining.pop('type'))

        return cls(
            author=author,
            committer=committer,
            date=date,
            committer_date=committer_date,
            type=revision_type,
            **remaining)
@attr.s(frozen=True)
class BaseContent(BaseModel):
    """Common base of `Content` and `SkippedContent`: holds the status and
    the hash-related helpers shared by both."""
    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['visible', 'hidden', 'absent']))

    @staticmethod
    def _hash_data(data: bytes):
        """Hash some data, returning most of the fields of a content
        object"""
        d = MultiHash.from_data(data).digest()
        d['data'] = data
        d['length'] = len(data)
        return d

    def to_dict(self):
        """Returns the dict form of the content, without the 'ctime' key
        when it is `None`."""
        content = super().to_dict()
        # 'ctime' is declared by the subclasses, not by this class, so do
        # not assume the key is present.
        if content.get('ctime') is None:
            content.pop('ctime', None)
        return content

    @classmethod
    def from_dict(cls, d, use_subclass=True):
        """Builds a content object from a dict.

        With `use_subclass` set (the default), the 'status' key picks the
        class to instantiate: `SkippedContent` for 'absent', `Content`
        otherwise. Subclasses pass `use_subclass=False` to be built
        directly from the dict."""
        if not use_subclass:
            return super().from_dict(d)
        # Chooses a subclass to instantiate instead.
        if d['status'] == 'absent':
            return SkippedContent.from_dict(d)
        return Content.from_dict(d)

    def get_hash(self, hash_name):
        """Returns the value of the hash named `hash_name`.

        Raises:
            ValueError: if `hash_name` is not one of `DEFAULT_ALGORITHMS`.
        """
        if hash_name not in DEFAULT_ALGORITHMS:
            raise ValueError('{} is not a valid hash name.'.format(hash_name))
        return getattr(self, hash_name)

    def hashes(self) -> Dict[str, bytes]:
        """Returns a dictionary {hash_name: hash_value}"""
        return {algo: getattr(self, algo) for algo in DEFAULT_ALGORITHMS}
@attr.s(frozen=True)
class SkippedContent(BaseContent):
    """A content whose status is 'absent': its hashes and length are all
    optional, it carries no data, and a reason must be given."""
    sha1 = attr.ib(type=Optional[bytes])
    sha1_git = attr.ib(type=Optional[Sha1Git])
    sha256 = attr.ib(type=Optional[bytes])
    blake2s256 = attr.ib(type=Optional[bytes])

    length = attr.ib(type=Optional[int])

    status = attr.ib(
        type=str,
        validator=attr.validators.in_(['absent']))
    reason = attr.ib(type=Optional[str], default=None)

    origin = attr.ib(type=Optional[Origin], default=None)

    ctime = attr.ib(type=Optional[datetime.datetime], default=None)

    @reason.validator
    def check_reason(self, attribute, value):
        """Checks a reason is provided (the status is always 'absent')."""
        if value is None:
            raise ValueError('Must provide a reason if content is absent.')

    @length.validator
    def check_length(self, attribute, value):
        """Checks the length, when known, is positive or -1."""
        # The length is declared Optional: None must not be compared to an
        # integer, which would raise a TypeError.
        if value is not None and value < -1:
            raise ValueError('Length must be positive or -1.')

    def to_dict(self):
        """Returns the dict form of the content, without the 'origin' key
        when it is `None`."""
        content = super().to_dict()
        if content['origin'] is None:
            del content['origin']
        return content

    @classmethod
    def from_data(cls, data, reason: str) -> 'SkippedContent':
        """Generate a SkippedContent from a given `data` byte string.

        This populates the SkippedContent with the hashes and length for the
        data passed as argument.

        You can use `attr.evolve` on such a generated content to nullify some
        of its attributes, e.g. for tests.
        """
        d = cls._hash_data(data)
        del d['data']
        d['status'] = 'absent'
        d['reason'] = reason
        return cls(**d)

    @classmethod
    def from_dict(cls, d):
        """Builds a SkippedContent from a dict, without modifying it.

        A 'data' key is tolerated only when its value is `None`.

        Raises:
            ValueError: if the dict has a non-None 'data' value.
        """
        fields = d.copy()
        if fields.pop('data', None) is not None:
            raise ValueError(
                'SkippedContent has no "data" attribute %r' % (d,))
        return super().from_dict(fields, use_subclass=False)
def test_person_from_fullname():
    """The author should have name, email and fullname filled.

    """
    # The input must carry the email between angle brackets, otherwise
    # from_fullname cannot produce the expected email below.
    fullname = b'tony <ynot@dagobah>'
    actual_person = Person.from_fullname(fullname)
    assert actual_person == Person(
        fullname=fullname,
        name=b'tony',
        email=b'ynot@dagobah',
    )
def test_git_author_line_to_author():
    """Checks Person.from_fullname on well-formed and malformed author
    lines: each key is the raw fullname, each value the expected Person."""
    # edge case out of the way
    with pytest.raises(TypeError):
        Person.from_fullname(None)

    tests = {
        b'a <b@c.com>': Person(
            name=b'a',
            email=b'b@c.com',
            fullname=b'a <b@c.com>',
        ),
        b'<foo@bar.com>': Person(
            name=None,
            email=b'foo@bar.com',
            fullname=b'<foo@bar.com>',
        ),
        # No closing bracket: everything after '<' is the email.
        b'malformed <email': Person(
            name=b'malformed',
            email=b'email',
            fullname=b'malformed <email',
        ),
        # Nested bracket: the first '<' and the last '>' delimit the email.
        b'malformed <"<br"@ckets>': Person(
            name=b'malformed',
            email=b'"<br"@ckets',
            fullname=b'malformed <"<br"@ckets>',
        ),
        b'trailing <sp@c.e> ': Person(
            name=b'trailing',
            email=b'sp@c.e',
            fullname=b'trailing <sp@c.e> ',
        ),
        b'no<sp@c.e>': Person(
            name=b'no',
            email=b'sp@c.e',
            fullname=b'no<sp@c.e>',
        ),
        b' more   <sp@c.es>': Person(
            name=b'more',
            email=b'sp@c.es',
            fullname=b' more   <sp@c.es>',
        ),
        b' <>': Person(
            name=None,
            email=None,
            fullname=b' <>',
        ),
    }

    for person in sorted(tests):
        expected_person = tests[person]
        assert expected_person == Person.from_fullname(person)
def test_release_model_id_computation():
    """The id of a release built without one must be the release
    identifier of its dict, as bytes."""
    rel_dict = dict(release_example)
    del rel_dict['id']

    rel_id = hash_to_bytes(release_identifier(rel_dict))
    for rel_model in [Release(**rel_dict), Release.from_dict(rel_dict)]:
        # rel_id is already bytes: compare directly, as the other
        # *_model_id_computation tests do.
        assert rel_model.id == rel_id
Snapshot.from_dict(snp_dict)]: assert snp_model.id == snp_id