Changeset View
Changeset View
Standalone View
Standalone View
swh/model/model.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
from copy import deepcopy | from copy import deepcopy | ||||
from enum import Enum | from enum import Enum | ||||
from hashlib import sha256 | from hashlib import sha256 | ||||
from typing import Dict, Iterable, Optional, Tuple, TypeVar, Union | from typing import Any, Dict, Iterable, Optional, Tuple, TypeVar, Union | ||||
from typing_extensions import Final | from typing_extensions import Final | ||||
import attr | import attr | ||||
from attrs_strict import type_validator | from attrs_strict import type_validator | ||||
import dateutil.parser | import dateutil.parser | ||||
import iso8601 | import iso8601 | ||||
from .identifiers import ( | from .identifiers import ( | ||||
normalize_timestamp, | normalize_timestamp, | ||||
directory_identifier, | directory_identifier, | ||||
revision_identifier, | revision_identifier, | ||||
release_identifier, | release_identifier, | ||||
snapshot_identifier, | snapshot_identifier, | ||||
SWHID, | |||||
) | ) | ||||
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash | from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash | ||||
class MissingData(Exception): | class MissingData(Exception): | ||||
"""Raised by `Content.with_data` when it has no way of fetching the | """Raised by `Content.with_data` when it has no way of fetching the | ||||
data (but not when fetching the data fails).""" | data (but not when fetching the data fails).""" | ||||
▲ Show 20 Lines • Show All 657 Lines • ▼ Show 20 Lines | ) -> "SkippedContent": | ||||
return cls(**d) | return cls(**d) | ||||
@classmethod | @classmethod | ||||
def from_dict(cls, d): | def from_dict(cls, d): | ||||
d2 = d.copy() | d2 = d.copy() | ||||
if d2.pop("data", None) is not None: | if d2.pop("data", None) is not None: | ||||
raise ValueError('SkippedContent has no "data" attribute %r' % d) | raise ValueError('SkippedContent has no "data" attribute %r' % d) | ||||
return super().from_dict(d2, use_subclass=False) | return super().from_dict(d2, use_subclass=False) | ||||
class MetadataAuthorityType(Enum): | |||||
DEPOSIT = "deposit" | |||||
FORGE = "forge" | |||||
REGISTRY = "registry" | |||||
@attr.s(frozen=True) | |||||
class MetadataAuthority(BaseModel): | |||||
"""Represents an entity that provides metadata about an origin or | |||||
software artifact.""" | |||||
type = attr.ib(type=MetadataAuthorityType, validator=type_validator()) | |||||
url = attr.ib(type=str, validator=type_validator()) | |||||
douardda: can't we directly take the path of immutability while at it here, e.g. using a Tuple[Tuple… | |||||
Done Inline Actionsok vlorentz: ok | |||||
metadata = attr.ib( | |||||
type=Optional[Dict[str, Any]], default=None, validator=type_validator() | |||||
) | |||||
@attr.s(frozen=True) | |||||
class MetadataFetcher(BaseModel): | |||||
"""Represents a software component used to fetch metadata from a metadata | |||||
authority, and ingest them into the Software Heritage archive.""" | |||||
name = attr.ib(type=str, validator=type_validator()) | |||||
version = attr.ib(type=str, validator=type_validator()) | |||||
metadata = attr.ib( | |||||
type=Optional[Dict[str, Any]], default=None, validator=type_validator() | |||||
) | |||||
class MetadataTargetType(Enum): | |||||
"""The type of object extrinsic metadata refer to.""" | |||||
CONTENT = "content" | |||||
DIRECTORY = "directory" | |||||
REVISION = "revision" | |||||
RELEASE = "release" | |||||
SNAPSHOT = "snapshot" | |||||
ORIGIN = "origin" | |||||
@attr.s(frozen=True) | |||||
class RawExtrinsicMetadata(BaseModel): | |||||
# target object | |||||
type = attr.ib(type=MetadataTargetType, validator=type_validator()) | |||||
id = attr.ib(type=Union[str, SWHID], validator=type_validator()) | |||||
Done Inline Actionswhy not an Optional[int] here? what this visit refers to? the visit of the metadata fetcher tool? If so it should better be moved in the "# source" block below. If not, it must refers to the origin visit (by a loader), in which case I do not really see the need for this field. douardda: why not an `Optional[int]` here? what this visit refers to? the visit of the metadata fetcher… | |||||
Done Inline Actionstypo vlorentz: typo | |||||
"""URL if type=MetadataTargetType.ORIGIN, else core SWHID""" | |||||
# source | |||||
discovery_date = attr.ib(type=datetime.datetime, validator=type_validator()) | |||||
authority = attr.ib(type=MetadataAuthority, validator=type_validator()) | |||||
fetcher = attr.ib(type=MetadataFetcher, validator=type_validator()) | |||||
# the metadata itself | |||||
format = attr.ib(type=str, validator=type_validator()) | |||||
metadata = attr.ib(type=bytes, validator=type_validator()) | |||||
# context | |||||
origin = attr.ib(type=Optional[str], default=None, validator=type_validator()) | |||||
visit = attr.ib(type=Optional[int], default=None, validator=type_validator()) | |||||
snapshot = attr.ib(type=Optional[SWHID], default=None, validator=type_validator()) | |||||
release = attr.ib(type=Optional[SWHID], default=None, validator=type_validator()) | |||||
revision = attr.ib(type=Optional[SWHID], default=None, validator=type_validator()) | |||||
path = attr.ib(type=Optional[bytes], default=None, validator=type_validator()) | |||||
directory = attr.ib(type=Optional[SWHID], default=None, validator=type_validator()) | |||||
@id.validator | |||||
def check_id(self, attribute, value): | |||||
if self.type == MetadataTargetType.ORIGIN: | |||||
if isinstance(value, SWHID) or value.startswith("swh:"): | |||||
raise ValueError( | |||||
"Got SWHID as id for origin metadata (expected an URL)." | |||||
) | |||||
else: | |||||
self._check_pid(self.type.value, value) | |||||
@origin.validator | |||||
def check_origin(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in ( | |||||
MetadataTargetType.SNAPSHOT, | |||||
MetadataTargetType.RELEASE, | |||||
MetadataTargetType.REVISION, | |||||
MetadataTargetType.DIRECTORY, | |||||
MetadataTargetType.CONTENT, | |||||
): | |||||
raise ValueError( | |||||
f"Unexpected 'origin' context for {self.type.value} object: {value}" | |||||
) | |||||
if value.startswith("swh:"): | |||||
# Technically this is valid; but: | |||||
# 1. SWHIDs are URIs, not URLs | |||||
# 2. if a SWHID gets here, it's very likely to be a mistake | |||||
# (and we can remove this check if it turns out there is a | |||||
# legitimate use for it). | |||||
raise ValueError(f"SWHID used as context origin URL: {value}") | |||||
@visit.validator | |||||
def check_visit(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in ( | |||||
MetadataTargetType.SNAPSHOT, | |||||
MetadataTargetType.RELEASE, | |||||
MetadataTargetType.REVISION, | |||||
MetadataTargetType.DIRECTORY, | |||||
MetadataTargetType.CONTENT, | |||||
): | |||||
raise ValueError( | |||||
f"Unexpected 'visit' context for {self.type.value} object: {value}" | |||||
) | |||||
if self.origin is None: | |||||
raise ValueError("'origin' context must be set if 'visit' is.") | |||||
if value <= 0: | |||||
raise ValueError("Nonpositive visit id") | |||||
@snapshot.validator | |||||
def check_snapshot(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in ( | |||||
MetadataTargetType.RELEASE, | |||||
MetadataTargetType.REVISION, | |||||
MetadataTargetType.DIRECTORY, | |||||
MetadataTargetType.CONTENT, | |||||
): | |||||
raise ValueError( | |||||
f"Unexpected 'snapshot' context for {self.type.value} object: {value}" | |||||
) | |||||
self._check_pid("snapshot", value) | |||||
@release.validator | |||||
def check_release(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in ( | |||||
MetadataTargetType.REVISION, | |||||
MetadataTargetType.DIRECTORY, | |||||
MetadataTargetType.CONTENT, | |||||
): | |||||
raise ValueError( | |||||
f"Unexpected 'release' context for {self.type.value} object: {value}" | |||||
) | |||||
self._check_pid("release", value) | |||||
@revision.validator | |||||
def check_revision(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in (MetadataTargetType.DIRECTORY, MetadataTargetType.CONTENT,): | |||||
raise ValueError( | |||||
f"Unexpected 'revision' context for {self.type.value} object: {value}" | |||||
) | |||||
self._check_pid("revision", value) | |||||
@path.validator | |||||
def check_path(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in (MetadataTargetType.DIRECTORY, MetadataTargetType.CONTENT,): | |||||
raise ValueError( | |||||
f"Unexpected 'path' context for {self.type.value} object: {value}" | |||||
) | |||||
@directory.validator | |||||
def check_directory(self, attribute, value): | |||||
if value is None: | |||||
return | |||||
if self.type not in (MetadataTargetType.CONTENT,): | |||||
raise ValueError( | |||||
f"Unexpected 'directory' context for {self.type.value} object: {value}" | |||||
) | |||||
self._check_pid("directory", value) | |||||
def _check_pid(self, expected_object_type, pid): | |||||
if isinstance(pid, str): | |||||
raise ValueError(f"Expected SWHID, got a string: {pid}") | |||||
if pid.object_type != expected_object_type: | |||||
raise ValueError( | |||||
f"Expected SWHID type '{expected_object_type}', " | |||||
f"got '{pid.object_type}' in {pid}" | |||||
) | |||||
if pid.metadata: | |||||
raise ValueError(f"Expected core SWHID, but got: {pid}") |
can't we directly take the path of immutability while at it here, e.g. using a Tuple[Tuple[bytes, bytes], ...] as metadata type instead?