diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import re
+import math
 import bisect
 import dateutil
 import collections
@@ -14,6 +15,7 @@
 import random
 import warnings
 
+from swh.core.serializers import msgpack_loads, msgpack_dumps
 from swh.model.hashutil import DEFAULT_ALGORITHMS
 from swh.model.identifiers import normalize_timestamp
 
@@ -25,15 +27,166 @@
 OriginVisitKey = collections.namedtuple('OriginVisitKey', 'origin date')
 
 
+class DuplicateKey(Exception):
+    pass
+
+
+_TABLE_GET_NO_DEFAULT = object()
+"""Unique object, to indicate no default is supplied to get()."""
+
+
+TYPES_WHITELIST = (int, float, str, bytes, tuple, frozenset,
+                   datetime.datetime, None.__class__)
+
+
+def date_from_model(d, *, prefix=''):
+    d = normalize_timestamp(d)
+    return {
+        prefix + 'date': (d['timestamp']['seconds'] +
+                          0.000001*d['timestamp']['microseconds']),
+        prefix + 'date_offset': d['offset'],
+        prefix + 'date_neg_utc_offset': d['negative_utc']}
+
+
+def date_to_model(timestamp, date_offset, date_neg_utc_offset):
+    # modf() yields the (fractional, integral) parts of the float timestamp.
+    (msec, sec) = math.modf(timestamp)
+    return {
+        'timestamp': {
+            'seconds': int(sec),
+            'microseconds': int(msec*1000000)},
+        'offset': date_offset,
+        'negative_utc': date_neg_utc_offset}
+
+
+def revision_to_model(rev):
+    if rev is None:
+        return None
+    author_date = date_to_model(
+        rev.date, rev.date_offset, rev.date_neg_utc_offset)
+    committer_date = date_to_model(
+        rev.committer_date, rev.committer_date_offset,
+        rev.committer_date_neg_utc_offset)
+    return {
+        'id': rev.id,
+        'message': rev.message,
+        'date': author_date,
+        'committer_date': committer_date,
+        'author': {
+            'name': rev.author_name,
+            'fullname': rev.author_fullname,
+            'email': rev.author_email},
+        'committer': {
+            'name': rev.committer_name,
+            'fullname': rev.committer_fullname,
+            'email': rev.committer_email},
+        'parents': list(rev.parents),
+        'type': rev.type,
+        'directory': rev.directory,
+        'metadata': msgpack_loads(rev.metadata),
+        'synthetic': rev.synthetic}
+
+
+def release_to_model(rel):
+    if rel is None:
+        return None
+    author_date = date_to_model(
+        rel.date, rel.date_offset, rel.date_neg_utc_offset)
+    return {
+        'id': rel.id,
+        'message': rel.message,
+        'date': author_date,
+        'author': {
+            'name': rel.author_name,
+            'fullname': rel.author_fullname,
+            'email': rel.author_email},
+        'target': rel.target,
+        'target_type': rel.target_type,
+        'name': rel.name,
+        'synthetic': rel.synthetic}
+
+
+def person_from_model(person, *, prefix):
+    return {
+        prefix + 'name': person['name'],
+        prefix + 'email': person['email'],
+        prefix + 'fullname': person['fullname']}
+
+
+DirectoryEntry = collections.namedtuple(
+    'DirectoryEntry', 'perms name target type')
+
+
+def check_types(value):
+    if isinstance(value, TYPES_WHITELIST):
+        if isinstance(value, (tuple, frozenset)):
+            for v in value:
+                check_types(v)
+        return value
+    else:
+        raise TypeError('Type of {!r} ({}) is not allowed in tables.'
+                        .format(value, value.__class__))
+
+
+class Table:
+    def __init__(self, name, keys, columns):
+        if not set(keys).isdisjoint(set(columns)):
+            raise ValueError('%r is not disjoint from %r' % (keys, columns))
+        if 'default' in keys:
+            raise ValueError('"default" is not a valid key name.')
+        self._keys = keys
+        self._columns = columns
+        self.Key = collections.namedtuple(name + 'Key', keys)
+        self.Record = collections.namedtuple(name + 'Record', keys + columns)
+        self._data = {}
+
+    def _make_key_from_record(self, record):
+        return self.Key(*(getattr(record, col) for col in self._keys))
+
+    def add(self, **kwargs):
+        record = self.Record(**kwargs)
+        check_types(record)
+        key = self._make_key_from_record(record)
+        if key in self._data:
+            raise DuplicateKey(key)
+        self._data[key] = record
+        return key
+
+    def contains(self, **kwargs):
+        key = self.Key(**kwargs)
+        check_types(key)
+        return key in self._data
+
+    def get(self, *, default=_TABLE_GET_NO_DEFAULT, **kwargs):
+        key = self.Key(**kwargs)
+        check_types(key)
+        if default is _TABLE_GET_NO_DEFAULT:
+            return self._data[key]
+        else:
+            return self._data.get(key, default)
+
+
 class Storage:
     def __init__(self):
         self._contents = {}
         self._contents_data = {}
         self._content_indexes = defaultdict(lambda: defaultdict(set))
-        self._directories = {}
-        self._revisions = {}
-        self._releases = {}
+        self._directories = Table('Directories', ['id'], ['entries'])
+        self._revisions = Table(
+            'Revisions', ['id'],
+            ['message', 'author_name', 'author_email', 'author_fullname',
+             'committer_name', 'committer_email', 'committer_fullname',
+             'date', 'date_offset', 'date_neg_utc_offset',
+             'committer_date', 'committer_date_offset',
+             'committer_date_neg_utc_offset',
+             'parents', 'type', 'directory', 'metadata', 'synthetic'])
+        self._releases = Table(
+            'Releases', ['id'],
+            ['name', 'message', 'date', 'date_offset', 'date_neg_utc_offset',
+             'author_name', 'author_email', 'author_fullname',
+             'target', 'target_type', 'synthetic'])
+
         self._snapshots = {}
         self._origins = {}
         self._origin_visits = {}
@@ -189,8 +342,12 @@
             - perms (int): entry permissions
 
         """
         for directory in directories:
-            if directory['id'] not in self._directories:
-                self._directories[directory['id']] = copy.deepcopy(directory)
+            if not self._directories.contains(id=directory['id']):
+                entries = [DirectoryEntry(**entry)
+                           for entry in directory['entries']]
+                self._directories.add(
+                    id=directory['id'],
+                    entries=tuple(entries))
                 self._objects[directory['id']].append(
                     ('directory', directory['id']))
@@ -205,7 +362,7 @@
 
         """
         for id in directory_ids:
-            if id not in self._directories:
+            if not self._directories.contains(id=id):
                 yield id
 
     def _join_dentry_to_content(self, dentry):
@@ -217,7 +374,7 @@
             'length',
         )
         ret = dict.fromkeys(keys)
-        ret.update(dentry)
+        ret.update(dentry._asdict())
         if ret['type'] == 'file':
             content = self.content_find({'sha1_git': ret['target']})
             if content:
@@ -236,8 +393,8 @@
             List of entries for such directory.
""" - if directory_id in self._directories: - for entry in self._directories[directory_id]['entries']: + if self._directories.contains(id=directory_id): + for entry in self._directories.get(id=directory_id).entries: ret = self._join_dentry_to_content(entry) ret['dir_id'] = directory_id yield ret @@ -310,14 +467,27 @@ - parents (list of sha1_git): the parents of this revision """ - for revision in revisions: - if revision['id'] not in self._revisions: - self._revisions[revision['id']] = rev = copy.deepcopy(revision) - rev['date'] = normalize_timestamp(rev.get('date')) - rev['committer_date'] = normalize_timestamp( - rev.get('committer_date')) - self._objects[revision['id']].append( - ('revision', revision['id'])) + for rev in revisions: + if not self._revisions.contains(id=rev['id']): + author_date_dict = date_from_model(rev['date']) + committer_date_dict = date_from_model(rev['committer_date'], + prefix='committer_') + self._revisions.add( + id=rev['id'], + message=rev['message'], + **author_date_dict, + **committer_date_dict, + **person_from_model(rev['author'], prefix='author_'), + **person_from_model(rev['committer'], prefix='committer_'), + parents=tuple(rev['parents']), + type=rev['type'], + directory=rev['directory'], + metadata=msgpack_dumps(rev['metadata']), + synthetic=rev['synthetic'], + ) + + self._objects[rev['id']].append( + ('revision', rev['id'])) def revision_missing(self, revision_ids): """List revisions missing from storage @@ -330,12 +500,12 @@ """ for id in revision_ids: - if id not in self._revisions: + if not self._revisions.contains(id=id): yield id def revision_get(self, revision_ids): for id in revision_ids: - yield copy.deepcopy(self._revisions.get(id)) + yield revision_to_model(self._revisions.get(id=id, default=None)) def _get_parent_revs(self, rev_id, seen, limit): if limit and len(seen) >= limit: @@ -343,8 +513,9 @@ if rev_id in seen: return seen.add(rev_id) - yield self._revisions[rev_id] - for parent in self._revisions[rev_id]['parents']: + rev = self._revisions.get(id=rev_id) + yield rev + for parent in rev.parents: yield from self._get_parent_revs(parent, seen, limit) def revision_log(self, revision_ids, limit=None): @@ -360,7 +531,8 @@ """ seen = set() for rev_id in revision_ids: - yield from self._get_parent_revs(rev_id, seen, limit) + for rev in self._get_parent_revs(rev_id, seen, limit): + yield revision_to_model(rev) def revision_shortlog(self, revisions, limit=None): """Fetch the shortlog for the given revisions @@ -397,10 +569,17 @@ """ for rel in releases: - rel['date'] = normalize_timestamp(rel['date']) - self._objects[rel['id']].append( - ('release', rel['id'])) - self._releases.update((rel['id'], rel) for rel in releases) + date_dict = date_from_model(rel['date']) + self._releases.add( + id=rel['id'], + name=rel['name'], + message=rel['message'], + **date_dict, + **person_from_model(rel['author'], prefix='author_'), + target=rel['target'], + target_type=rel['target_type'], + synthetic=rel['synthetic']) + self._objects[rel['id']].append(('release', rel['id'])) def release_missing(self, releases): """List releases missing from storage @@ -412,7 +591,9 @@ a list of missing release ids """ - yield from (rel for rel in releases if rel not in self._releases) + for rel_id in releases: + if not self._releases.contains(id=rel_id): + yield rel_id def release_get(self, releases): """Given a list of sha1, return the releases's information @@ -427,7 +608,8 @@ ValueError: if the keys does not match (url and type) nor id. 
""" - yield from map(self._releases.__getitem__, releases) + for rel_id in releases: + yield release_to_model(self._releases.get(id=rel_id)) def snapshot_add(self, origin, visit, snapshot): """Add a snapshot for the given origin/visit couple