diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -6,6 +6,7 @@
                dh-python (>= 2),
                python3-aiohttp,
                python3-all,
+               python3-attr,
                python3-click,
                python3-dateutil,
                python3-flask,
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@
 python-dateutil
 vcversioner
 aiohttp
+attrs
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -12,8 +12,11 @@
 import datetime
 import itertools
 import random
+from typing import Optional, Tuple
 import warnings
 
+import attr
+
 from swh.model.hashutil import DEFAULT_ALGORITHMS
 from swh.model.identifiers import normalize_timestamp
 
@@ -25,15 +28,220 @@
 OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date')
 
 
+class DuplicateKey(Exception):
+    pass
+
+
+_TABLE_GET_NO_DEFAULT = object()
+"""Unique object, to indicate no default is supplied to get()."""
+
+
+Sha1Git = bytes
+
+
+@attr.s
+class Person:
+    name = attr.ib(type=bytes)
+    email = attr.ib(type=bytes)
+    fullname = attr.ib(type=bytes)
+
+
+@attr.s
+class Timestamp:
+    seconds = attr.ib(type=int)
+    microseconds = attr.ib(type=int)
+
+
+@attr.s
+class Date:
+    timestamp = attr.ib(type=Timestamp)
+    offset = attr.ib(type=int)
+    negative_utc = attr.ib(type=Optional[bool])
+
+    @classmethod
+    def from_model(cls, date):
+        date = normalize_timestamp(date)
+        return cls(
+            timestamp=Timestamp(**date.pop('timestamp')),
+            **date,
+        )
+
+    def to_model(self):
+        return attr.asdict(self)
+
+
+@attr.s
+class DirectoryEntry:
+    perms = attr.ib(type=int)
+    name = attr.ib(type=bytes)
+    target = attr.ib(type=Sha1Git)
+    type = attr.ib(type=str)
+
+    def to_model(self):
+        return attr.asdict(self)
+
+
+@attr.s
+class Directory:
+    KEYS = ('id',)
+    id = attr.ib(type=Sha1Git)
+    entries = attr.ib(type=Tuple[DirectoryEntry])
+
+    @classmethod
+    def from_model(cls, dir_):
+        dir_ = dir_.copy()
+        return cls(
+            entries=tuple(DirectoryEntry(**e)
for e in dir_.pop('entries')), + **dir_) + + def to_model(self): + return attr.asdict(self) + + +@attr.s +class Revision: + KEYS = ('id',) + + id = attr.ib(type=Sha1Git) + message = attr.ib(type=bytes) + author = attr.ib(type=Person) + committer = attr.ib(type=Person) + date = attr.ib(type=Date) + committer_date = attr.ib(type=Date) + parents = attr.ib(type=Tuple[Sha1Git]) + type = attr.ib(type=str) + directory = attr.ib(type=Sha1Git) + metadata = attr.ib(type=str) + synthetic = attr.ib(type=bool) + + @classmethod + def from_model(cls, rev): + rev = rev.copy() + return cls( + date=Date.from_model(rev.pop('date')), + committer_date=Date.from_model(rev.pop('committer_date')), + author=Person(**rev.pop('author')), + committer=Person(**rev.pop('committer')), + **rev) + + def to_model(self): + rev = attr.asdict(self) + rev['date'] = self.date.to_model() + rev['committer_date'] = self.committer_date.to_model() + return rev + + +@attr.s +class Release: + KEYS = ('id',) + + id = attr.ib(type=Sha1Git) + name = attr.ib(type=bytes) + message = attr.ib(type=bytes) + date = attr.ib(type=Date) + author = attr.ib(type=Person) + target = attr.ib(type=Sha1Git) + target_type = attr.ib(type=str) + synthetic = attr.ib(type=bool) + + @classmethod + def from_model(cls, rel): + rel = rel.copy() + return cls( + date=Date.from_model(rel.pop('date')), + author=Person(**rel.pop('author')), + **rel) + + def to_model(self): + rel = attr.asdict(self) + rel['date'] = self.date.to_model() + return rel + + +class Table: + """Subclasses of this class holds data in a way similar to an + SQL table. In particular, it enforces all fields are present and have + a nice (ie. serialization-friendly immutable) type. 
+ + Each row's `keys` are expected to be unique.""" + def __init__(self): + self._data = {} + + def _make_key_from_record(self, record): + return tuple(getattr(record, col) for col in self.Record.KEYS) + + def _make_key(self, **kwargs): + if set(kwargs) != set(self.Record.KEYS): + raise TypeError('Expected keys %s, got %s' % ( + self.Record.KEYS, tuple(kwargs))) + return tuple(kwargs[col] for col in self.Record.KEYS) + + def add(self, record): + """Adds a new row in the table. + The expected keyword args are the set of keys **and** of columns + passed to `__init__`. + + Raises: + DuplicateKey + """ + key = self._make_key_from_record(record) + if key in self._data: + raise DuplicateKey(key) + self._data[key] = record + return key + + def contains(self, **kwargs): + """Returns whether there is row in the table whose key matches + the arguments. + The expected keyword args are the set of **keys** passed to + `__init__`. + """ + key = self._make_key(**kwargs) + return key in self._data + + def get(self, *, default=_TABLE_GET_NO_DEFAULT, **kwargs): + """Returns the row in the table whose key matches the arguments, + if any. + The expected keyword args are the set of **keys** passed to + `__init__`. + + Returns: + self.Record: a namedtuple whose fields are the keys **and** + columns passed to `__init__`. + + Raises: + KeyError: if there is no row with that key and `default=` was + not provided. 
+ """ + key = self._make_key(**kwargs) + if default is _TABLE_GET_NO_DEFAULT: + return self._data[key] + else: + return self._data.get(key, default) + + +class DirectoryTable(Table): + Record = Directory + + +class RevisionsTable(Table): + Record = Revision + + +class ReleasesTable(Table): + Record = Release + + class Storage: def __init__(self): self._contents = {} self._contents_data = {} self._content_indexes = defaultdict(lambda: defaultdict(set)) - self._directories = {} - self._revisions = {} - self._releases = {} + self._directories = DirectoryTable() + self._revisions = RevisionsTable() + self._releases = ReleasesTable() + self._snapshots = {} self._origins = {} self._origin_visits = {} @@ -197,8 +405,8 @@ - perms (int): entry permissions """ for directory in directories: - if directory['id'] not in self._directories: - self._directories[directory['id']] = copy.deepcopy(directory) + if not self._directories.contains(id=directory['id']): + self._directories.add(Directory.from_model(directory)) self._objects[directory['id']].append( ('directory', directory['id'])) @@ -213,7 +421,7 @@ """ for id in directory_ids: - if id not in self._directories: + if not self._directories.contains(id=id): yield id def _join_dentry_to_content(self, dentry): @@ -225,7 +433,7 @@ 'length', ) ret = dict.fromkeys(keys) - ret.update(dentry) + ret.update(dentry.to_model()) if ret['type'] == 'file': content = self.content_find({'sha1_git': ret['target']}) if content: @@ -244,8 +452,8 @@ List of entries for such directory. """ - if directory_id in self._directories: - for entry in self._directories[directory_id]['entries']: + if self._directories.contains(id=directory_id): + for entry in self._directories.get(id=directory_id).entries: ret = self._join_dentry_to_content(entry) ret['dir_id'] = directory_id yield ret @@ -312,14 +520,12 @@ date dictionaries have the form defined in :mod:`swh.model`. 
""" - for revision in revisions: - if revision['id'] not in self._revisions: - self._revisions[revision['id']] = rev = copy.deepcopy(revision) - rev['date'] = normalize_timestamp(rev.get('date')) - rev['committer_date'] = normalize_timestamp( - rev.get('committer_date')) - self._objects[revision['id']].append( - ('revision', revision['id'])) + for rev in revisions: + if not self._revisions.contains(id=rev['id']): + self._revisions.add(Revision.from_model(rev)) + + self._objects[rev['id']].append( + ('revision', rev['id'])) def revision_missing(self, revision_ids): """List revisions missing from storage @@ -332,12 +538,16 @@ """ for id in revision_ids: - if id not in self._revisions: + if not self._revisions.contains(id=id): yield id def revision_get(self, revision_ids): for id in revision_ids: - yield copy.deepcopy(self._revisions.get(id)) + rev = self._revisions.get(id=id, default=None) + if rev: + yield rev.to_model() + else: + yield None def _get_parent_revs(self, rev_id, seen, limit): if limit and len(seen) >= limit: @@ -345,8 +555,9 @@ if rev_id in seen: return seen.add(rev_id) - yield self._revisions[rev_id] - for parent in self._revisions[rev_id]['parents']: + rev = self._revisions.get(id=rev_id) + yield rev + for parent in rev.parents: yield from self._get_parent_revs(parent, seen, limit) def revision_log(self, revision_ids, limit=None): @@ -362,7 +573,8 @@ """ seen = set() for rev_id in revision_ids: - yield from self._get_parent_revs(rev_id, seen, limit) + for rev in self._get_parent_revs(rev_id, seen, limit): + yield rev.to_model() def revision_shortlog(self, revisions, limit=None): """Fetch the shortlog for the given revisions @@ -397,10 +609,10 @@ the date dictionary has the form defined in :mod:`swh.model`. 
""" for rel in releases: - rel['date'] = normalize_timestamp(rel['date']) - self._objects[rel['id']].append( - ('release', rel['id'])) - self._releases.update((rel['id'], rel) for rel in releases) + r = self._releases.add(Release.from_model(rel)) + print(repr(r)) + (rel_id,) = r + self._objects[rel['id']].append(('release', rel_id)) def release_missing(self, releases): """List releases missing from storage @@ -412,7 +624,9 @@ a list of missing release ids """ - yield from (rel for rel in releases if rel not in self._releases) + for rel_id in releases: + if not self._releases.contains(id=rel_id): + yield rel_id def release_get(self, releases): """Given a list of sha1, return the releases's information @@ -427,7 +641,8 @@ ValueError: if the keys does not match (url and type) nor id. """ - yield from map(self._releases.__getitem__, releases) + for rel_id in releases: + yield self._releases.get(id=rel_id).to_model() def snapshot_add(self, origin, visit, snapshot): """Add a snapshot for the given origin/visit couple @@ -989,8 +1204,8 @@ for item in self._origin_metadata[origin_id]: item = copy.deepcopy(item) provider = self.metadata_provider_get(item['provider_id']) - for attr in ('name', 'type', 'url'): - item['provider_' + attr] = provider[attr] + for attr_name in ('name', 'type', 'url'): + item['provider_' + attr_name] = provider[attr_name] metadata.append(item) return metadata