diff --git a/PKG-INFO b/PKG-INFO index 92f2105..468d926 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,38 +1,38 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-model +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-model ========= Implementation of the Data model of the Software Heritage project, used to archive source code artifacts. This module defines the notion of Persistent Identifier (PID) and provides tools to compute them: ```sh $ swh-identify fork.c kmod.c sched/deadline.c swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements.txt b/requirements.txt index cd97184..5962345 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner Click attrs hypothesis +python-dateutil diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 92f2105..468d926 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,38 +1,38 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-model +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-model ========= Implementation of the Data model of the Software Heritage project, used to archive source code artifacts. This module defines the notion of Persistent Identifier (PID) and provides tools to compute them: ```sh $ swh-identify fork.c kmod.c sched/deadline.c swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c $ swh-identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 7ca184a..0ceb6df 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -1,44 +1,45 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt swh/__init__.py swh.model.egg-info/PKG-INFO swh.model.egg-info/SOURCES.txt swh.model.egg-info/dependency_links.txt swh.model.egg-info/entry_points.txt swh.model.egg-info/requires.txt swh.model.egg-info/top_level.txt swh/model/__init__.py swh/model/cli.py swh/model/exceptions.py swh/model/from_disk.py swh/model/hashutil.py swh/model/hypothesis_strategies.py swh/model/identifiers.py swh/model/merkle.py swh/model/model.py swh/model/toposort.py swh/model/validators.py swh/model/fields/__init__.py swh/model/fields/compound.py swh/model/fields/hashes.py swh/model/fields/simple.py swh/model/tests/__init__.py swh/model/tests/generate_testdata_from_disk.py swh/model/tests/test_cli.py swh/model/tests/test_from_disk.py swh/model/tests/test_hashutil.py swh/model/tests/test_hypothesis_strategies.py swh/model/tests/test_identifiers.py swh/model/tests/test_merkle.py +swh/model/tests/test_model.py swh/model/tests/test_toposort.py swh/model/tests/test_validators.py swh/model/tests/data/dir-folders/sample-folder.tgz swh/model/tests/fields/__init__.py swh/model/tests/fields/test_compound.py swh/model/tests/fields/test_hashes.py swh/model/tests/fields/test_simple.py \ No newline at end of file diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt index 674d225..af8f5be 100644 --- a/swh.model.egg-info/requires.txt +++ b/swh.model.egg-info/requires.txt @@ -1,7 +1,8 @@ vcversioner Click attrs hypothesis +python-dateutil [testing] pytest diff --git a/swh/model/model.py b/swh/model/model.py index 036879d..25a565b 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -1,247 +1,359 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from enum import Enum from typing import List, Optional, Dict import attr +import dateutil.parser +from .identifiers import normalize_timestamp # TODO: Limit this to 20 bytes Sha1Git = bytes +def contains_optional_validator(validator): + """Inspects an attribute's validator to find its type. + Inspired by `hypothesis/searchstrategy/attrs.py`.""" + if isinstance(validator, attr.validators._OptionalValidator): + return True + elif isinstance(validator, attr.validators._AndValidator): + for validator in validator._validators: + res = contains_optional_validator(validator) + if res: + return True + else: + return False + + +class BaseModel: + """Base class for SWH model classes. + + Provides serialization/deserialization to/from Python dictionaries, + that are suitable for JSON/msgpack-like formats.""" + + def to_dict(self): + """Wrapper of `attr.asdict` that can be overriden by subclasses + that have special handling of some of the fields.""" + return attr.asdict(self) + + @classmethod + def from_dict(cls, d): + """Takes a dictionary representing a tree of SWH objects, and + recursively builds the corresponding objects.""" + if not isinstance(d, dict): + raise TypeError( + '%s.from_dict expects a dict, not %r' % (cls.__name__, d)) + kwargs = {} + for (name, attribute) in attr.fields_dict(cls).items(): + type_ = attribute.type + + # Heuristic to detect `Optional[X]` and unwrap it to `X`. + if contains_optional_validator(attribute.validator): + if name not in d: + continue + if d[name] is None: + continue + else: + type_ = type_.__args__[0] + + # Construct an object of the expected type + if issubclass(type_, BaseModel): + kwargs[name] = type_.from_dict(d[name]) + elif issubclass(type_, Enum): + kwargs[name] = type_(d[name]) + else: + kwargs[name] = d[name] + + return cls(**kwargs) + + @attr.s -class Person: +class Person(BaseModel): + """Represents the author/committer of a revision or release.""" name = attr.ib(type=bytes) email = attr.ib(type=bytes) fullname = attr.ib(type=bytes) @attr.s -class Timestamp: +class Timestamp(BaseModel): + """Represents a naive timestamp from a VCS.""" seconds = attr.ib(type=int) microseconds = attr.ib(type=int) @seconds.validator def check_seconds(self, attribute, value): """Check that seconds fit in a 64-bits signed integer.""" if not (-2**63 <= value < 2**63): raise ValueError('Seconds must be a signed 64-bits integer.') @microseconds.validator def check_microseconds(self, attribute, value): """Checks that microseconds are positive and < 1000000.""" if not (0 <= value < 10**6): raise ValueError('Microseconds must be in [0, 1000000[.') @attr.s -class TimestampWithTimezone: +class TimestampWithTimezone(BaseModel): + """Represents a TZ-aware timestamp from a VCS.""" timestamp = attr.ib(type=Timestamp) offset = attr.ib(type=int) negative_utc = attr.ib(type=bool) - def to_dict(self): - return attr.asdict(self) - @offset.validator def check_offset(self, attribute, value): + """Checks the offset is a 16-bits signed integer (in theory, it + should always be between -14 and +14 hours).""" if not (-2**15 <= value < 2**15): # max 14 hours offset in theory, but you never know what # you'll find in the wild... raise ValueError('offset too large: %d minutes' % value) + @classmethod + def from_dict(cls, d): + """Builds a TimestampWithTimezone from any of the formats + accepted by :py:`swh.model.normalize_timestamp`.""" + return super().from_dict(normalize_timestamp(d)) + @attr.s -class Origin: +class Origin(BaseModel): + """Represents a software source: a VCS and an URL.""" type = attr.ib(type=str) url = attr.ib(type=str) - def to_dict(self): - return attr.asdict(self) - @attr.s -class OriginVisit: +class OriginVisit(BaseModel): + """Represents a visit of an origin at a given point in time, by a + SWH loader.""" origin = attr.ib(type=Origin) date = attr.ib(type=datetime.datetime) - visit = attr.ib(type=Optional[int]) + visit = attr.ib(type=Optional[int], + validator=attr.validators.optional([])) """Should not be set before calling 'origin_visit_add()'.""" def to_dict(self): - ov = attr.asdict(self) - ov['origin'] = self.origin.to_dict() + """Serializes the date as a string and omits the visit id if it is + `None`.""" + ov = super().to_dict() ov['date'] = str(self.date) - if not ov['visit']: + if ov['visit'] is None: del ov['visit'] return ov + @classmethod + def from_dict(cls, d): + """Parses the date from a string, and accepts missing visit ids.""" + return cls( + origin=Origin.from_dict(d['origin']), + date=dateutil.parser.parse(d['date']), + visit=d.get('visit')) + class TargetType(Enum): + """The type of content pointed to by a snapshot branch. Usually a + revision or an alias.""" CONTENT = 'content' DIRECTORY = 'directory' REVISION = 'revision' RELEASE = 'release' SNAPSHOT = 'snapshot' ALIAS = 'alias' class ObjectType(Enum): + """The type of content pointed to by a release. Usually a revision""" CONTENT = 'content' DIRECTORY = 'directory' REVISION = 'revision' RELEASE = 'release' SNAPSHOT = 'snapshot' @attr.s -class SnapshotBranch: +class SnapshotBranch(BaseModel): + """Represents one of the branches of a snapshot.""" target = attr.ib(type=bytes) target_type = attr.ib(type=TargetType) @target.validator def check_target(self, attribute, value): + """Checks the target type is not an alias, checks the target is a + valid sha1_git.""" if self.target_type != TargetType.ALIAS: if len(value) != 20: raise ValueError('Wrong length for bytes identifier: %d' % len(value)) def to_dict(self): branch = attr.asdict(self) branch['target_type'] = branch['target_type'].value return branch @attr.s -class Snapshot: +class Snapshot(BaseModel): + """Represents the full state of an origin at a given point in time.""" id = attr.ib(type=Sha1Git) branches = attr.ib(type=Dict[bytes, Optional[SnapshotBranch]]) def to_dict(self): return { 'id': self.id, 'branches': { name: branch.to_dict() for (name, branch) in self.branches.items() } } + @classmethod + def from_dict(cls, d): + d = { + **d, + 'branches': { + name: SnapshotBranch.from_dict(branch) + for (name, branch) in d['branches'].items() + } + } + return cls(**d) + @attr.s -class Release: +class Release(BaseModel): id = attr.ib(type=Sha1Git) name = attr.ib(type=bytes) message = attr.ib(type=bytes) - date = attr.ib(type=Optional[TimestampWithTimezone]) - author = attr.ib(type=Optional[Person]) - target = attr.ib(type=Optional[Sha1Git]) + target = attr.ib(type=Optional[Sha1Git], + validator=attr.validators.optional([])) target_type = attr.ib(type=ObjectType) synthetic = attr.ib(type=bool) + author = attr.ib(type=Optional[Person], + default=None, + validator=attr.validators.optional([])) + date = attr.ib(type=Optional[TimestampWithTimezone], + default=None, + validator=attr.validators.optional([])) + + @author.validator + def check_author(self, attribute, value): + """If the author is `None`, checks the date is `None` too.""" + if self.author is None and self.date is not None: + raise ValueError('release date must be None if author is None.') def to_dict(self): rel = attr.asdict(self) rel['date'] = self.date.to_dict() if self.date is not None else None rel['target_type'] = rel['target_type'].value return rel - @author.validator - def check_author(self, attribute, value): - if self.author is None and self.date is not None: - raise ValueError('release date must be None if date is None.') - class RevisionType(Enum): GIT = 'git' TAR = 'tar' DSC = 'dsc' SUBVERSION = 'svn' MERCURIAL = 'hg' @attr.s -class Revision: +class Revision(BaseModel): id = attr.ib(type=Sha1Git) message = attr.ib(type=bytes) author = attr.ib(type=Person) committer = attr.ib(type=Person) date = attr.ib(type=TimestampWithTimezone) committer_date = attr.ib(type=TimestampWithTimezone) - parents = attr.ib(type=List[Sha1Git]) type = attr.ib(type=RevisionType) directory = attr.ib(type=Sha1Git) - metadata = attr.ib(type=Optional[Dict[str, object]]) synthetic = attr.ib(type=bool) + metadata = attr.ib(type=Optional[Dict[str, object]], + default=None, + validator=attr.validators.optional([])) + parents = attr.ib(type=List[Sha1Git], + default=attr.Factory(list)) def to_dict(self): rev = attr.asdict(self) rev['date'] = self.date.to_dict() rev['committer_date'] = self.committer_date.to_dict() rev['type'] = rev['type'].value return rev @attr.s -class DirectoryEntry: +class DirectoryEntry(BaseModel): name = attr.ib(type=bytes) type = attr.ib(type=str, validator=attr.validators.in_(['file', 'dir', 'rev'])) target = attr.ib(type=Sha1Git) perms = attr.ib(type=int) """Usually one of the values of `swh.model.from_disk.DentryPerms`.""" - def to_dict(self): - return attr.asdict(self) - @attr.s -class Directory: +class Directory(BaseModel): id = attr.ib(type=Sha1Git) entries = attr.ib(type=List[DirectoryEntry]) def to_dict(self): dir_ = attr.asdict(self) dir_['entries'] = [entry.to_dict() for entry in self.entries] return dir_ + @classmethod + def from_dict(cls, d): + d = { + **d, + 'entries': list(map(DirectoryEntry.from_dict, d['entries'])) + } + return super().from_dict(d) + @attr.s -class Content: +class Content(BaseModel): sha1 = attr.ib(type=bytes) sha1_git = attr.ib(type=Sha1Git) sha256 = attr.ib(type=bytes) blake2s256 = attr.ib(type=bytes) - data = attr.ib(type=bytes) length = attr.ib(type=int) status = attr.ib( type=str, validator=attr.validators.in_(['visible', 'absent', 'hidden'])) - reason = attr.ib(type=Optional[str]) + reason = attr.ib(type=Optional[str], + default=None, + validator=attr.validators.optional([])) + data = attr.ib(type=Optional[bytes], + default=None, + validator=attr.validators.optional([])) @length.validator def check_length(self, attribute, value): """Checks the length is positive.""" if value < 0: raise ValueError('Length must be positive.') @reason.validator def check_reason(self, attribute, value): """Checks the reason is full iff status != absent.""" assert self.reason == value if self.status == 'absent' and value is None: raise ValueError('Must provide a reason if content is absent.') elif self.status != 'absent' and value is not None: raise ValueError( 'Must not provide a reason if content is not absent.') def to_dict(self): content = attr.asdict(self) if content['data'] is None: del content['data'] if content['reason'] is None: del content['reason'] return content diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py new file mode 100644 index 0000000..220ba32 --- /dev/null +++ b/swh/model/tests/test_model.py @@ -0,0 +1,26 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import copy + +from hypothesis import given + +from swh.model.hypothesis_strategies import objects + + +@given(objects()) +def test_todict_inverse_fromdict(objtype_and_obj): + (obj_type, obj) = objtype_and_obj + obj_as_dict = obj.to_dict() + obj_as_dict_copy = copy.deepcopy(obj_as_dict) + + # Check the composition of to_dict and from_dict is the identity + assert obj == type(obj).from_dict(obj_as_dict) + + # Check from_dict() does not change the input dict + assert obj_as_dict == obj_as_dict_copy + + # Check the composition of from_dict and to_dict is the identity + assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() diff --git a/version.txt b/version.txt index 871d5f5..bcfdf8a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.35-0-gfee3a41 \ No newline at end of file +v0.0.36-0-gfc3d3c1 \ No newline at end of file