diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
index 9741394a..83c1ae25 100644
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -1,97 +1,97 @@
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import warnings

 from . import storage

 Storage = storage.Storage


 class HashCollision(Exception):
     pass


 STORAGE_IMPLEMENTATION = {
     'pipeline', 'local', 'remote', 'memory', 'filter', 'buffer', 'retry',
 }


 def get_storage(cls, **kwargs):
     """Get a storage object of class `cls` instantiated with `kwargs`.

     Args:
         cls (str): storage's class, one of 'pipeline', 'local', 'remote',
             'memory', 'filter', 'buffer', 'retry'
         kwargs (dict): arguments passed through to the storage class
             constructor

     Returns:
         an instance of swh.storage.Storage or compatible class

     Raises:
         ValueError if passed an unknown storage class.

     """
     if cls not in STORAGE_IMPLEMENTATION:
         raise ValueError('Unknown storage class `%s`. Supported: %s' % (
             cls, ', '.join(STORAGE_IMPLEMENTATION)))

     if 'args' in kwargs:
         warnings.warn(
             'Explicit "args" key is deprecated, use keys directly instead.',
             DeprecationWarning)
         kwargs = kwargs['args']

     if cls == 'pipeline':
         return get_storage_pipeline(**kwargs)

     if cls == 'remote':
         from .api.client import RemoteStorage as Storage
     elif cls == 'local':
         from .storage import Storage
     elif cls == 'memory':
-        from .in_memory import Storage
+        from .in_memory import InMemoryStorage as Storage
     elif cls == 'filter':
         from .filter import FilteringProxyStorage as Storage
     elif cls == 'buffer':
         from .buffer import BufferingProxyStorage as Storage
     elif cls == 'retry':
         from .retry import RetryingProxyStorage as Storage

     return Storage(**kwargs)
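 # Example (editor's sketch, not part of this patch): typical invocations of
 # get_storage after this change -- 'memory' now resolves to InMemoryStorage
 # under the hood, while callers keep using the generic entry point:
 #
 #   from swh.storage import get_storage
 #
 #   storage = get_storage('memory')  # swh.storage.in_memory.InMemoryStorage
 #   storage = get_storage('retry', storage={'cls': 'memory'})
 #   storage = get_storage('pipeline', steps=[
 #       {'cls': 'filter'},
 #       {'cls': 'buffer', 'min_batch_size': {'content': 10}},
 #       {'cls': 'memory'},
 #   ])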
""" storage_config = None for step in reversed(steps): if 'args' in step: warnings.warn( 'Explicit "args" key is deprecated, use keys directly ' 'instead.', DeprecationWarning) step = { 'cls': step['cls'], **step['args'], } if storage_config: step['storage'] = storage_config storage_config = step return get_storage(**storage_config) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py index 656f040d..6e260138 100644 --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -1,1053 +1,1053 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import bisect import dateutil import collections import copy import datetime import itertools import random from collections import defaultdict from datetime import timedelta from typing import Any, Dict, List, Optional import attr from swh.model.model import ( Content, Directory, Revision, Release, Snapshot, OriginVisit, Origin, SHA1_SIZE) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from .storage import get_journal_writer from .converters import origin_url_to_sha1 from .utils import get_partition_bounds_bytes # Max block size of contents to return BULK_BLOCK_CONTENT_LEN_MAX = 10000 def now(): return datetime.datetime.now(tz=datetime.timezone.utc) -class Storage: +class InMemoryStorage: def __init__(self, journal_writer=None): self._contents = {} self._content_indexes = defaultdict(lambda: defaultdict(set)) self._skipped_contents = {} self._skipped_content_indexes = defaultdict(lambda: defaultdict(set)) self.reset() if journal_writer: self.journal_writer = get_journal_writer(**journal_writer) else: self.journal_writer = None def reset(self): self._directories = {} self._revisions = {} self._releases = {} self._snapshots = {} self._origins = {} self._origins_by_id = [] self._origins_by_sha1 = {} self._origin_visits = {} self._persons = [] self._origin_metadata = defaultdict(list) self._tools = {} self._metadata_providers = {} self._objects = defaultdict(list) # ideally we would want a skip list for both fast inserts and searches self._sorted_sha1s = [] self.objstorage = get_objstorage('memory', {}) def check_config(self, *, check_write): return True def _content_add(self, contents, with_data): content_with_data = [] content_without_data = [] for content in contents: if content.status is None: content.status = 'visible' if content.length is None: content.length = -1 if content.status != 'absent': if self._content_key(content) not in self._contents: content_with_data.append(content) else: if self._content_key(content) not in self._skipped_contents: content_without_data.append(content) if self.journal_writer: for content in content_with_data: content = attr.evolve(content, data=None) self.journal_writer.write_addition('content', content) for content in content_without_data: self.journal_writer.write_addition('content', content) count_content_added, count_content_bytes_added = \ self._content_add_present(content_with_data, with_data) count_skipped_content_added = self._content_add_absent( content_without_data ) summary = { 'content:add': count_content_added, 'skipped_content:add': count_skipped_content_added, } if with_data: summary['content:add:bytes'] = count_content_bytes_added return summary def 
_content_add_present(self, contents, with_data): count_content_added = 0 count_content_bytes_added = 0 for content in contents: key = self._content_key(content) if key in self._contents: continue for algorithm in DEFAULT_ALGORITHMS: hash_ = content.get_hash(algorithm) if hash_ in self._content_indexes[algorithm]\ and (algorithm not in {'blake2s256', 'sha256'}): from . import HashCollision raise HashCollision(algorithm, hash_, key) for algorithm in DEFAULT_ALGORITHMS: hash_ = content.get_hash(algorithm) self._content_indexes[algorithm][hash_].add(key) self._objects[content.sha1_git].append( ('content', content.sha1)) self._contents[key] = content bisect.insort(self._sorted_sha1s, content.sha1) count_content_added += 1 if with_data: content_data = self._contents[key].data self._contents[key] = attr.evolve( self._contents[key], data=None) count_content_bytes_added += len(content_data) self.objstorage.add(content_data, content.sha1) return (count_content_added, count_content_bytes_added) def _content_add_absent(self, contents): count = 0 skipped_content_missing = self.skipped_content_missing(contents) for content in skipped_content_missing: key = self._content_key(content) for algo in DEFAULT_ALGORITHMS: self._skipped_content_indexes[algo][content.get_hash(algo)] \ .add(key) self._skipped_contents[key] = content count += 1 return count def _content_to_model(self, contents): for content in contents: content = content.copy() content.pop('origin', None) yield Content.from_dict(content) def content_add(self, content): now = datetime.datetime.now(tz=datetime.timezone.utc) content = [attr.evolve(c, ctime=now) for c in self._content_to_model(content)] return self._content_add(content, with_data=True) def content_update(self, content, keys=[]): if self.journal_writer: raise NotImplementedError( 'content_update is not yet supported with a journal_writer.') for cont_update in content: cont_update = cont_update.copy() sha1 = cont_update.pop('sha1') for old_key in self._content_indexes['sha1'][sha1]: old_cont = self._contents.pop(old_key) for algorithm in DEFAULT_ALGORITHMS: hash_ = old_cont.get_hash(algorithm) self._content_indexes[algorithm][hash_].remove(old_key) new_cont = attr.evolve(old_cont, **cont_update) new_key = self._content_key(new_cont) self._contents[new_key] = new_cont for algorithm in DEFAULT_ALGORITHMS: hash_ = new_cont.get_hash(algorithm) self._content_indexes[algorithm][hash_].add(new_key) def content_add_metadata(self, content): content = list(self._content_to_model(content)) return self._content_add(content, with_data=False) def content_get(self, content): # FIXME: Make this method support slicing the `data`. if len(content) > BULK_BLOCK_CONTENT_LEN_MAX: raise ValueError( "Sending at most %s contents." 
% BULK_BLOCK_CONTENT_LEN_MAX) for obj_id in content: try: data = self.objstorage.get(obj_id) except ObjNotFoundError: yield None continue yield {'sha1': obj_id, 'data': data} def content_get_range(self, start, end, limit=1000): if limit is None: raise ValueError('Development error: limit should not be None') from_index = bisect.bisect_left(self._sorted_sha1s, start) sha1s = itertools.islice(self._sorted_sha1s, from_index, None) sha1s = ((sha1, content_key) for sha1 in sha1s for content_key in self._content_indexes['sha1'][sha1]) matched = [] next_content = None for sha1, key in sha1s: if sha1 > end: break if len(matched) >= limit: next_content = sha1 break matched.append(self._contents[key].to_dict()) return { 'contents': matched, 'next': next_content, } def content_get_partition( self, partition_id: int, nb_partitions: int, limit: int = 1000, page_token: str = None): if limit is None: raise ValueError('Development error: limit should not be None') (start, end) = get_partition_bounds_bytes( partition_id, nb_partitions, SHA1_SIZE) if page_token: start = hash_to_bytes(page_token) if end is None: end = b'\xff'*SHA1_SIZE result = self.content_get_range(start, end, limit) result2 = { 'contents': result['contents'], 'next_page_token': None, } if result['next']: result2['next_page_token'] = hash_to_hex(result['next']) return result2 def content_get_metadata( self, contents: List[bytes]) -> Dict[bytes, List[Dict]]: result: Dict = {sha1: [] for sha1 in contents} for sha1 in contents: if sha1 in self._content_indexes['sha1']: objs = self._content_indexes['sha1'][sha1] # only 1 element as content_add_metadata would have raised a # hash collision otherwise for key in objs: d = self._contents[key].to_dict() del d['ctime'] if 'data' in d: del d['data'] result[sha1].append(d) return result def content_find(self, content): if not set(content).intersection(DEFAULT_ALGORITHMS): raise ValueError('content keys must contain at least one of: ' '%s' % ', '.join(sorted(DEFAULT_ALGORITHMS))) found = [] for algo in DEFAULT_ALGORITHMS: hash = content.get(algo) if hash and hash in self._content_indexes[algo]: found.append(self._content_indexes[algo][hash]) if not found: return [] keys = list(set.intersection(*found)) return [self._contents[key].to_dict() for key in keys] def content_missing(self, content, key_hash='sha1'): for cont in content: for (algo, hash_) in cont.items(): if algo not in DEFAULT_ALGORITHMS: continue if hash_ not in self._content_indexes.get(algo, []): yield cont[key_hash] break else: for result in self.content_find(cont): if result['status'] == 'missing': yield cont[key_hash] def content_missing_per_sha1(self, contents): for content in contents: if content not in self._content_indexes['sha1']: yield content def content_missing_per_sha1_git(self, contents): for content in contents: if content not in self._content_indexes['sha1_git']: yield content def skipped_content_missing(self, contents): for content in contents: for (key, algorithm) in self._content_key_algorithm(content): if algorithm == 'blake2s256': continue if key not in self._skipped_content_indexes[algorithm]: # index must contain hashes of algos except blake2s256 # else the content is considered skipped yield content break def content_get_random(self): return random.choice(list(self._content_indexes['sha1_git'])) def directory_add(self, directories): directories = list(directories) if self.journal_writer: self.journal_writer.write_additions( 'directory', (dir_ for dir_ in directories if dir_['id'] not in self._directories)) directories 
= [Directory.from_dict(d) for d in directories] count = 0 for directory in directories: if directory.id not in self._directories: count += 1 self._directories[directory.id] = directory self._objects[directory.id].append( ('directory', directory.id)) return {'directory:add': count} def directory_missing(self, directories): for id in directories: if id not in self._directories: yield id def _join_dentry_to_content(self, dentry): keys = ( 'status', 'sha1', 'sha1_git', 'sha256', 'length', ) ret = dict.fromkeys(keys) ret.update(dentry) if ret['type'] == 'file': # TODO: Make it able to handle more than one content content = self.content_find({'sha1_git': ret['target']}) if content: content = content[0] for key in keys: ret[key] = content[key] return ret def _directory_ls(self, directory_id, recursive, prefix=b''): if directory_id in self._directories: for entry in self._directories[directory_id].entries: ret = self._join_dentry_to_content(entry.to_dict()) ret['name'] = prefix + ret['name'] ret['dir_id'] = directory_id yield ret if recursive and ret['type'] == 'dir': yield from self._directory_ls( ret['target'], True, prefix + ret['name'] + b'/') def directory_ls(self, directory, recursive=False): yield from self._directory_ls(directory, recursive) def directory_entry_get_by_path(self, directory, paths): return self._directory_entry_get_by_path(directory, paths, b'') def directory_get_random(self): if not self._directories: return None return random.choice(list(self._directories)) def _directory_entry_get_by_path(self, directory, paths, prefix): if not paths: return contents = list(self.directory_ls(directory)) if not contents: return def _get_entry(entries, name): for entry in entries: if entry['name'] == name: entry = entry.copy() entry['name'] = prefix + entry['name'] return entry first_item = _get_entry(contents, paths[0]) if len(paths) == 1: return first_item if not first_item or first_item['type'] != 'dir': return return self._directory_entry_get_by_path( first_item['target'], paths[1:], prefix + paths[0] + b'/') def revision_add(self, revisions): revisions = list(revisions) if self.journal_writer: self.journal_writer.write_additions( 'revision', (rev for rev in revisions if rev['id'] not in self._revisions)) revisions = [Revision.from_dict(rev) for rev in revisions] count = 0 for revision in revisions: if revision.id not in self._revisions: revision = attr.evolve( revision, committer=self._person_add(revision.committer), author=self._person_add(revision.author)) self._revisions[revision.id] = revision self._objects[revision.id].append( ('revision', revision.id)) count += 1 return {'revision:add': count} def revision_missing(self, revisions): for id in revisions: if id not in self._revisions: yield id def revision_get(self, revisions): for id in revisions: if id in self._revisions: yield self._revisions.get(id).to_dict() else: yield None def _get_parent_revs(self, rev_id, seen, limit): if limit and len(seen) >= limit: return if rev_id in seen or rev_id not in self._revisions: return seen.add(rev_id) yield self._revisions[rev_id].to_dict() for parent in self._revisions[rev_id].parents: yield from self._get_parent_revs(parent, seen, limit) def revision_log(self, revisions, limit=None): seen = set() for rev_id in revisions: yield from self._get_parent_revs(rev_id, seen, limit) def revision_shortlog(self, revisions, limit=None): yield from ((rev['id'], rev['parents']) for rev in self.revision_log(revisions, limit)) def revision_get_random(self): return random.choice(list(self._revisions)) def 
release_add(self, releases): releases = list(releases) if self.journal_writer: self.journal_writer.write_additions( 'release', (rel for rel in releases if rel['id'] not in self._releases)) releases = [Release.from_dict(rel) for rel in releases] count = 0 for rel in releases: if rel.id not in self._releases: if rel.author: self._person_add(rel.author) self._objects[rel.id].append( ('release', rel.id)) self._releases[rel.id] = rel count += 1 return {'release:add': count} def release_missing(self, releases): yield from (rel for rel in releases if rel not in self._releases) def release_get(self, releases): for rel_id in releases: if rel_id in self._releases: yield self._releases[rel_id].to_dict() else: yield None def release_get_random(self): return random.choice(list(self._releases)) def snapshot_add(self, snapshots): count = 0 snapshots = (Snapshot.from_dict(d) for d in snapshots) snapshots = (snap for snap in snapshots if snap.id not in self._snapshots) for snapshot in snapshots: if self.journal_writer: self.journal_writer.write_addition('snapshot', snapshot) sorted_branch_names = sorted(snapshot.branches) self._snapshots[snapshot.id] = (snapshot, sorted_branch_names) self._objects[snapshot.id].append(('snapshot', snapshot.id)) count += 1 return {'snapshot:add': count} def snapshot_missing(self, snapshots): for id in snapshots: if id not in self._snapshots: yield id def snapshot_get(self, snapshot_id): return self.snapshot_get_branches(snapshot_id) def snapshot_get_by_origin_visit(self, origin, visit): origin_url = self._get_origin_url(origin) if not origin_url: return if origin_url not in self._origins or \ visit > len(self._origin_visits[origin_url]): return None snapshot_id = self._origin_visits[origin_url][visit-1].snapshot if snapshot_id: return self.snapshot_get(snapshot_id) else: return None def snapshot_get_latest(self, origin, allowed_statuses=None): origin_url = self._get_origin_url(origin) if not origin_url: return visit = self.origin_visit_get_latest( origin_url, allowed_statuses=allowed_statuses, require_snapshot=True) if visit and visit['snapshot']: snapshot = self.snapshot_get(visit['snapshot']) if not snapshot: raise ValueError( 'last origin visit references an unknown snapshot') return snapshot def snapshot_count_branches(self, snapshot_id): (snapshot, _) = self._snapshots[snapshot_id] return collections.Counter(branch.target_type.value if branch else None for branch in snapshot.branches.values()) def snapshot_get_branches(self, snapshot_id, branches_from=b'', branches_count=1000, target_types=None): res = self._snapshots.get(snapshot_id) if res is None: return None (snapshot, sorted_branch_names) = res from_index = bisect.bisect_left( sorted_branch_names, branches_from) if target_types: next_branch = None branches = {} for branch_name in sorted_branch_names[from_index:]: branch = snapshot.branches[branch_name] if branch and branch.target_type.value in target_types: if len(branches) < branches_count: branches[branch_name] = branch else: next_branch = branch_name break else: # As there is no 'target_types', we can do that much faster to_index = from_index + branches_count returned_branch_names = sorted_branch_names[from_index:to_index] branches = {branch_name: snapshot.branches[branch_name] for branch_name in returned_branch_names} if to_index >= len(sorted_branch_names): next_branch = None else: next_branch = sorted_branch_names[to_index] branches = {name: branch.to_dict() if branch else None for (name, branch) in branches.items()} return { 'id': snapshot_id, 'branches': 
branches, 'next_branch': next_branch, } def snapshot_get_random(self): return random.choice(list(self._snapshots)) def object_find_by_sha1_git(self, ids): ret = {} for id_ in ids: objs = self._objects.get(id_, []) ret[id_] = [{ 'sha1_git': id_, 'type': obj[0], } for obj in objs] return ret def _convert_origin(self, t): if t is None: return None return t.to_dict() def origin_get(self, origins): if isinstance(origins, dict): # Old API return_single = True origins = [origins] else: return_single = False # Sanity check to be error-compatible with the pgsql backend if any('id' in origin for origin in origins) \ and not all('id' in origin for origin in origins): raise ValueError( 'Either all origins or none at all should have an "id".') if any('url' in origin for origin in origins) \ and not all('url' in origin for origin in origins): raise ValueError( 'Either all origins or none at all should have ' 'an "url" key.') results = [] for origin in origins: result = None if 'url' in origin: if origin['url'] in self._origins: result = self._origins[origin['url']] else: raise ValueError( 'Origin must have an url.') results.append(self._convert_origin(result)) if return_single: assert len(results) == 1 return results[0] else: return results def origin_get_by_sha1(self, sha1s): return [ self._convert_origin(self._origins_by_sha1.get(sha1)) for sha1 in sha1s ] def origin_get_range(self, origin_from=1, origin_count=100): origin_from = max(origin_from, 1) if origin_from <= len(self._origins_by_id): max_idx = origin_from + origin_count - 1 if max_idx > len(self._origins_by_id): max_idx = len(self._origins_by_id) for idx in range(origin_from-1, max_idx): origin = self._convert_origin( self._origins[self._origins_by_id[idx]]) yield {'id': idx+1, **origin} def origin_list(self, page_token: Optional[str] = None, limit: int = 100 ) -> dict: origin_urls = sorted(self._origins) if page_token: from_ = bisect.bisect_left(origin_urls, page_token) else: from_ = 0 result = { 'origins': [{'url': origin_url} for origin_url in origin_urls[from_:from_+limit]] } if from_+limit < len(origin_urls): result['next_page_token'] = origin_urls[from_+limit] return result def origin_search(self, url_pattern, offset=0, limit=50, regexp=False, with_visit=False): origins = map(self._convert_origin, self._origins.values()) if regexp: pat = re.compile(url_pattern) origins = [orig for orig in origins if pat.search(orig['url'])] else: origins = [orig for orig in origins if url_pattern in orig['url']] if with_visit: origins = [ orig for orig in origins if len(self._origin_visits[orig['url']]) > 0 and set(ov.snapshot for ov in self._origin_visits[orig['url']] if ov.snapshot) & set(self._snapshots)] return origins[offset:offset+limit] def origin_count(self, url_pattern, regexp=False, with_visit=False): return len(self.origin_search(url_pattern, regexp=regexp, with_visit=with_visit, limit=len(self._origins))) def origin_add(self, origins): origins = copy.deepcopy(list(origins)) for origin in origins: self.origin_add_one(origin) return origins def origin_add_one(self, origin): origin = Origin.from_dict(origin) if origin.url not in self._origins: if self.journal_writer: self.journal_writer.write_addition('origin', origin) # generate an origin_id because it is needed by origin_get_range. 
# TODO: remove this when we remove origin_get_range origin_id = len(self._origins) + 1 self._origins_by_id.append(origin.url) assert len(self._origins_by_id) == origin_id self._origins[origin.url] = origin self._origins_by_sha1[origin_url_to_sha1(origin.url)] = origin self._origin_visits[origin.url] = [] self._objects[origin.url].append(('origin', origin.url)) return origin.url def origin_visit_add(self, origin, date, type): origin_url = origin if origin_url is None: raise ValueError('Unknown origin.') if isinstance(date, str): # FIXME: Converge on iso8601 at some point date = dateutil.parser.parse(date) elif not isinstance(date, datetime.datetime): raise TypeError('date must be a datetime or a string.') visit_ret = None if origin_url in self._origins: origin = self._origins[origin_url] # visit ids are in the range [1, +inf[ visit_id = len(self._origin_visits[origin_url]) + 1 status = 'ongoing' visit = OriginVisit( origin=origin.url, date=date, type=type, status=status, snapshot=None, metadata=None, visit=visit_id, ) self._origin_visits[origin_url].append(visit) visit_ret = { 'origin': origin.url, 'visit': visit_id, } self._objects[(origin_url, visit_id)].append( ('origin_visit', None)) if self.journal_writer: self.journal_writer.write_addition('origin_visit', visit) return visit_ret def origin_visit_update(self, origin, visit_id, status=None, metadata=None, snapshot=None): if not isinstance(origin, str): raise TypeError('origin must be a string, not %r' % (origin,)) origin_url = self._get_origin_url(origin) if origin_url is None: raise ValueError('Unknown origin.') try: visit = self._origin_visits[origin_url][visit_id-1] except IndexError: raise ValueError('Unknown visit_id for this origin') \ from None updates = {} if status: updates['status'] = status if metadata: updates['metadata'] = metadata if snapshot: updates['snapshot'] = snapshot visit = attr.evolve(visit, **updates) if self.journal_writer: self.journal_writer.write_update('origin_visit', visit) self._origin_visits[origin_url][visit_id-1] = visit def origin_visit_upsert(self, visits): for visit in visits: if not isinstance(visit['origin'], str): raise TypeError("visit['origin'] must be a string, not %r" % (visit['origin'],)) visits = [OriginVisit.from_dict(d) for d in visits] if self.journal_writer: for visit in visits: self.journal_writer.write_addition('origin_visit', visit) for visit in visits: visit_id = visit.visit origin_url = visit.origin visit = attr.evolve(visit, origin=origin_url) self._objects[(origin_url, visit_id)].append( ('origin_visit', None)) while len(self._origin_visits[origin_url]) <= visit_id: self._origin_visits[origin_url].append(None) self._origin_visits[origin_url][visit_id-1] = visit def _convert_visit(self, visit): if visit is None: return visit = visit.to_dict() return visit def origin_visit_get(self, origin, last_visit=None, limit=None): origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits: visits = self._origin_visits[origin_url] if last_visit is not None: visits = visits[last_visit:] if limit is not None: visits = visits[:limit] for visit in visits: if not visit: continue visit_id = visit.visit yield self._convert_visit( self._origin_visits[origin_url][visit_id-1]) def origin_visit_find_by_date(self, origin, visit_date): origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits: visits = self._origin_visits[origin_url] visit = min( visits, key=lambda v: (abs(v.date - visit_date), -v.visit)) return self._convert_visit(visit) def 
origin_visit_get_by(self, origin, visit): origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits and \ visit <= len(self._origin_visits[origin_url]): return self._convert_visit( self._origin_visits[origin_url][visit-1]) def origin_visit_get_latest( self, origin, allowed_statuses=None, require_snapshot=False): origin = self._origins.get(origin) if not origin: return visits = self._origin_visits[origin.url] if allowed_statuses is not None: visits = [visit for visit in visits if visit.status in allowed_statuses] if require_snapshot: visits = [visit for visit in visits if visit.snapshot] visit = max( visits, key=lambda v: (v.date, v.visit), default=None) return self._convert_visit(visit) def _select_random_origin_visit_by_type(self, type: str) -> str: while True: url = random.choice(list(self._origin_visits.keys())) random_origin_visits = self._origin_visits[url] if random_origin_visits[0].type == type: return url def origin_visit_get_random(self, type: str) -> Optional[Dict[str, Any]]: url = self._select_random_origin_visit_by_type(type) random_origin_visits = copy.deepcopy(self._origin_visits[url]) random_origin_visits.reverse() back_in_the_day = now() - timedelta(weeks=12) # 3 months back # This should be enough for tests for visit in random_origin_visits: if visit.date > back_in_the_day and visit.status == 'full': return visit.to_dict() else: return None def stat_counters(self): keys = ( 'content', 'directory', 'origin', 'origin_visit', 'person', 'release', 'revision', 'skipped_content', 'snapshot' ) stats = {key: 0 for key in keys} stats.update(collections.Counter( obj_type for (obj_type, obj_id) in itertools.chain(*self._objects.values()))) return stats def refresh_stat_counters(self): pass def origin_metadata_add(self, origin_url, ts, provider, tool, metadata): if not isinstance(origin_url, str): raise TypeError('origin_id must be str, not %r' % (origin_url,)) if isinstance(ts, str): ts = dateutil.parser.parse(ts) origin_metadata = { 'origin_url': origin_url, 'discovery_date': ts, 'tool_id': tool, 'metadata': metadata, 'provider_id': provider, } self._origin_metadata[origin_url].append(origin_metadata) return None def origin_metadata_get_by(self, origin_url, provider_type=None): if not isinstance(origin_url, str): raise TypeError('origin_url must be str, not %r' % (origin_url,)) metadata = [] for item in self._origin_metadata[origin_url]: item = copy.deepcopy(item) provider = self.metadata_provider_get(item['provider_id']) for attr_name in ('name', 'type', 'url'): item['provider_' + attr_name] = \ provider['provider_' + attr_name] metadata.append(item) return metadata def tool_add(self, tools): inserted = [] for tool in tools: key = self._tool_key(tool) assert 'id' not in tool record = copy.deepcopy(tool) record['id'] = key # TODO: remove this if key not in self._tools: self._tools[key] = record inserted.append(copy.deepcopy(self._tools[key])) return inserted def tool_get(self, tool): return self._tools.get(self._tool_key(tool)) def metadata_provider_add(self, provider_name, provider_type, provider_url, metadata): provider = { 'provider_name': provider_name, 'provider_type': provider_type, 'provider_url': provider_url, 'metadata': metadata, } key = self._metadata_provider_key(provider) provider['id'] = key self._metadata_providers[key] = provider return key def metadata_provider_get(self, provider_id): return self._metadata_providers.get(provider_id) def metadata_provider_get_by(self, provider): key = self._metadata_provider_key(provider) return 
self._metadata_providers.get(key) def _get_origin_url(self, origin): if isinstance(origin, str): return origin else: raise TypeError('origin must be a string.') def _person_add(self, person): key = ('person', person.fullname) if key not in self._objects: person_id = len(self._persons) + 1 self._persons.append(person) self._objects[key].append(('person', person_id)) else: person_id = self._objects[key][0][1] person = self._persons[person_id-1] return person @staticmethod def _content_key(content): """A stable key for a content""" return tuple(getattr(content, key) for key in sorted(DEFAULT_ALGORITHMS)) @staticmethod def _content_key_algorithm(content): """ A stable key and the algorithm for a content""" if isinstance(content, Content): content = content.to_dict() return tuple((content.get(key), key) for key in sorted(DEFAULT_ALGORITHMS)) @staticmethod def _tool_key(tool): return '%r %r %r' % (tool['name'], tool['version'], tuple(sorted(tool['configuration'].items()))) @staticmethod def _metadata_provider_key(provider): return '%r %r' % (provider['provider_name'], provider['provider_url']) def diff_directories(self, from_dir, to_dir, track_renaming=False): raise NotImplementedError('InMemoryStorage.diff_directories') def diff_revisions(self, from_rev, to_rev, track_renaming=False): raise NotImplementedError('InMemoryStorage.diff_revisions') def diff_revision(self, revision, track_renaming=False): raise NotImplementedError('InMemoryStorage.diff_revision') diff --git a/swh/storage/tests/algos/test_origin.py b/swh/storage/tests/algos/test_origin.py index d79a43d7..2083818a 100644 --- a/swh/storage/tests/algos/test_origin.py +++ b/swh/storage/tests/algos/test_origin.py @@ -1,74 +1,74 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -from swh.storage.in_memory import Storage +from swh.storage.in_memory import InMemoryStorage from swh.storage.algos.origin import iter_origins def assert_list_eq(left, right, msg=None): assert list(left) == list(right), msg def test_iter_origins(): - storage = Storage() + storage = InMemoryStorage() origins = storage.origin_add([ {'url': 'bar'}, {'url': 'qux'}, {'url': 'quuz'}, ]) assert_list_eq(iter_origins(storage), origins) assert_list_eq(iter_origins(storage, batch_size=1), origins) assert_list_eq(iter_origins(storage, batch_size=2), origins) for i in range(1, 5): assert_list_eq( iter_origins(storage, origin_from=i+1), origins[i:], i) assert_list_eq( iter_origins(storage, origin_from=i+1, batch_size=1), origins[i:], i) assert_list_eq( iter_origins(storage, origin_from=i+1, batch_size=2), origins[i:], i) for j in range(i, 5): assert_list_eq( iter_origins( storage, origin_from=i+1, origin_to=j+1), origins[i:j], (i, j)) assert_list_eq( iter_origins( storage, origin_from=i+1, origin_to=j+1, batch_size=1), origins[i:j], (i, j)) assert_list_eq( iter_origins( storage, origin_from=i+1, origin_to=j+1, batch_size=2), origins[i:j], (i, j)) -@patch('swh.storage.in_memory.Storage.origin_get_range') +@patch('swh.storage.in_memory.InMemoryStorage.origin_get_range') def test_iter_origins_batch_size(mock_origin_get_range): - storage = Storage() + storage = InMemoryStorage() mock_origin_get_range.return_value = [] list(iter_origins(storage)) mock_origin_get_range.assert_called_with( origin_from=1, origin_count=10000) list(iter_origins(storage, 
batch_size=42)) mock_origin_get_range.assert_called_with( origin_from=1, origin_count=42) diff --git a/swh/storage/tests/test_init.py b/swh/storage/tests/test_init.py index e95d4ea9..fabb3381 100644 --- a/swh/storage/tests/test_init.py +++ b/swh/storage/tests/test_init.py @@ -1,137 +1,137 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from unittest.mock import patch from swh.storage import get_storage from swh.storage.api.client import RemoteStorage from swh.storage.storage import Storage as DbStorage -from swh.storage.in_memory import Storage as MemoryStorage +from swh.storage.in_memory import InMemoryStorage from swh.storage.buffer import BufferingProxyStorage from swh.storage.filter import FilteringProxyStorage from swh.storage.retry import RetryingProxyStorage @patch('swh.storage.storage.psycopg2.pool') def test_get_storage(mock_pool): """Instantiating an existing storage should be ok """ mock_pool.ThreadedConnectionPool.return_value = None for cls, real_class, dummy_args in [ ('remote', RemoteStorage, {'url': 'url'}), - ('memory', MemoryStorage, {}), + ('memory', InMemoryStorage, {}), ('local', DbStorage, { 'db': 'postgresql://db', 'objstorage': { 'cls': 'memory', 'args': {}, }, }), ('filter', FilteringProxyStorage, {'storage': { 'cls': 'memory'} }), ('buffer', BufferingProxyStorage, {'storage': { 'cls': 'memory'} }), ('retry', RetryingProxyStorage, {'storage': { 'cls': 'memory'} }), ]: actual_storage = get_storage(cls, **dummy_args) assert actual_storage is not None assert isinstance(actual_storage, real_class) @patch('swh.storage.storage.psycopg2.pool') def test_get_storage_legacy_args(mock_pool): """Instantiating an existing storage should be ok even with the legacy explicit 'args' keys """ mock_pool.ThreadedConnectionPool.return_value = None for cls, real_class, dummy_args in [ ('remote', RemoteStorage, {'url': 'url'}), - ('memory', MemoryStorage, {}), + ('memory', InMemoryStorage, {}), ('local', DbStorage, { 'db': 'postgresql://db', 'objstorage': { 'cls': 'memory', 'args': {}, }, }), ('filter', FilteringProxyStorage, {'storage': { 'cls': 'memory', 'args': {}} }), ('buffer', BufferingProxyStorage, {'storage': { 'cls': 'memory', 'args': {}} }), ]: with pytest.warns(DeprecationWarning): actual_storage = get_storage(cls, args=dummy_args) assert actual_storage is not None assert isinstance(actual_storage, real_class) def test_get_storage_failure(): """Instantiating an unknown storage should raise """ with pytest.raises(ValueError, match='Unknown storage class `unknown`'): get_storage('unknown', args=[]) def test_get_storage_pipeline(): config = { 'cls': 'pipeline', 'steps': [ { 'cls': 'filter', }, { 'cls': 'buffer', 'min_batch_size': { 'content': 10, }, }, { 'cls': 'memory', } ] } storage = get_storage(**config) assert isinstance(storage, FilteringProxyStorage) assert isinstance(storage.storage, BufferingProxyStorage) - assert isinstance(storage.storage.storage, MemoryStorage) + assert isinstance(storage.storage.storage, InMemoryStorage) def test_get_storage_pipeline_legacy_args(): config = { 'cls': 'pipeline', 'steps': [ { 'cls': 'filter', }, { 'cls': 'buffer', 'args': { 'min_batch_size': { 'content': 10, }, } }, { 'cls': 'memory', } ] } with pytest.warns(DeprecationWarning): storage = get_storage(**config) assert isinstance(storage, FilteringProxyStorage) assert 
isinstance(storage.storage, BufferingProxyStorage) - assert isinstance(storage.storage.storage, MemoryStorage) + assert isinstance(storage.storage.storage, InMemoryStorage) diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py index 6843ab86..a2d7684a 100644 --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -1,885 +1,899 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import psycopg2 import pytest from typing import Dict from unittest.mock import call from swh.storage import HashCollision from swh.storage.retry import RetryingProxyStorage @pytest.fixture def swh_storage(): return RetryingProxyStorage(storage={'cls': 'memory'}) def test_retrying_proxy_storage_content_add(swh_storage, sample_data): """Standard content_add works as before """ sample_content = sample_data['content'][0] content = next(swh_storage.content_get([sample_content['sha1']])) assert not content s = swh_storage.content_add([sample_content]) assert s == { 'content:add': 1, 'content:add:bytes': sample_content['length'], 'skipped_content:add': 0 } content = next(swh_storage.content_get([sample_content['sha1']])) assert content['sha1'] == sample_content['sha1'] def test_retrying_proxy_storage_content_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.content_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.content_add') mock_memory.side_effect = [ # first try goes ko HashCollision('content hash collision'), # second try goes ko psycopg2.IntegrityError('content already inserted'), # ok then! 
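        # (editor's note) RetryingProxyStorage is expected to swallow
        # HashCollision and psycopg2.IntegrityError and call the backend
        # again, so this third value is what content_add finally returns.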
{'content:add': 1} ] sample_content = sample_data['content'][0] content = next(swh_storage.content_get([sample_content['sha1']])) assert not content s = swh_storage.content_add([sample_content]) assert s == {'content:add': 1} assert mock_memory.has_calls([ call([sample_content]), call([sample_content]), call([sample_content]), ]) def test_retrying_proxy_swh_storage_content_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.content_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.content_add') mock_memory.side_effect = ValueError('Refuse to add content always!') sample_content = sample_data['content'][0] content = next(swh_storage.content_get([sample_content['sha1']])) assert not content with pytest.raises(ValueError, match='Refuse to add'): swh_storage.content_add([sample_content]) assert mock_memory.call_count == 1 def test_retrying_proxy_storage_content_add_metadata(swh_storage, sample_data): """Standard content_add_metadata works as before """ sample_content = sample_data['content_metadata'][0] pk = sample_content['sha1'] content_metadata = swh_storage.content_get_metadata([pk]) assert not content_metadata[pk] s = swh_storage.content_add_metadata([sample_content]) assert s == { 'content:add': 1, 'skipped_content:add': 0 } content_metadata = swh_storage.content_get_metadata([pk]) assert len(content_metadata[pk]) == 1 assert content_metadata[pk][0]['sha1'] == pk def test_retrying_proxy_storage_content_add_metadata_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.content_add_metadata') + 'swh.storage.in_memory.InMemoryStorage.content_add_metadata') mock_memory.side_effect = [ # first try goes ko HashCollision('content_metadata hash collision'), # second try goes ko psycopg2.IntegrityError('content_metadata already inserted'), # ok then! 
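        # content_add_metadata stores records without their data, so the
        # success summary carries no 'content:add:bytes' entry.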
{'content:add': 1} ] sample_content = sample_data['content_metadata'][0] s = swh_storage.content_add_metadata([sample_content]) assert s == {'content:add': 1} assert mock_memory.has_calls([ call([sample_content]), call([sample_content]), call([sample_content]), ]) def test_retrying_proxy_swh_storage_content_add_metadata_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.content_add_metadata') + 'swh.storage.in_memory.InMemoryStorage.content_add_metadata') mock_memory.side_effect = ValueError('Refuse to add content_metadata!') sample_content = sample_data['content_metadata'][0] pk = sample_content['sha1'] content_metadata = swh_storage.content_get_metadata([pk]) assert not content_metadata[pk] with pytest.raises(ValueError, match='Refuse to add'): swh_storage.content_add_metadata([sample_content]) assert mock_memory.call_count == 1 def test_retrying_proxy_swh_storage_origin_add_one(swh_storage, sample_data): """Standard origin_add_one works as before """ sample_origin = sample_data['origin'][0] origin = swh_storage.origin_get(sample_origin) assert not origin r = swh_storage.origin_add_one(sample_origin) assert r == sample_origin['url'] origin = swh_storage.origin_get(sample_origin) assert origin['url'] == sample_origin['url'] def test_retrying_proxy_swh_storage_origin_add_one_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ sample_origin = sample_data['origin'][1] - mock_memory = mocker.patch('swh.storage.in_memory.Storage.origin_add_one') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.origin_add_one') mock_memory.side_effect = [ # first try goes ko HashCollision('origin hash collision'), # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! 
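        # origin_add_one returns the origin URL on success, hence a plain
        # string rather than a summary dict here.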
sample_origin['url'] ] origin = swh_storage.origin_get(sample_origin) assert not origin r = swh_storage.origin_add_one(sample_origin) assert r == sample_origin['url'] assert mock_memory.has_calls([ call([sample_origin]), call([sample_origin]), call([sample_origin]), ]) def test_retrying_proxy_swh_storage_origin_add_one_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.origin_add_one') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.origin_add_one') mock_memory.side_effect = ValueError('Refuse to add origin always!') sample_origin = sample_data['origin'][0] origin = swh_storage.origin_get(sample_origin) assert not origin with pytest.raises(ValueError, match='Refuse to add'): swh_storage.origin_add_one([sample_origin]) assert mock_memory.call_count == 1 def test_retrying_proxy_swh_storage_origin_visit_add(swh_storage, sample_data): """Standard origin_visit_add works as before """ sample_origin = sample_data['origin'][0] swh_storage.origin_add_one(sample_origin) origin_url = sample_origin['url'] origin = list(swh_storage.origin_visit_get(origin_url)) assert not origin origin_visit = swh_storage.origin_visit_add( origin_url, '2020-01-01', 'hg') assert origin_visit['origin'] == origin_url assert isinstance(origin_visit['visit'], int) origin_visit = next(swh_storage.origin_visit_get(origin_url)) assert origin_visit['origin'] == origin_url assert isinstance(origin_visit['visit'], int) def test_retrying_proxy_swh_storage_origin_visit_add_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ sample_origin = sample_data['origin'][1] swh_storage.origin_add_one(sample_origin) origin_url = sample_origin['url'] mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_visit_add') + 'swh.storage.in_memory.InMemoryStorage.origin_visit_add') mock_memory.side_effect = [ # first try goes ko HashCollision('origin hash collision'), # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! 
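        # visit ids are allocated per origin starting at 1, so the first
        # successful visit is reported as visit 1.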
{'origin': origin_url, 'visit': 1} ] origin = list(swh_storage.origin_visit_get(origin_url)) assert not origin r = swh_storage.origin_visit_add(sample_origin, '2020-01-01', 'git') assert r == {'origin': origin_url, 'visit': 1} assert mock_memory.has_calls([ call(sample_origin, '2020-01-01', 'git'), call(sample_origin, '2020-01-01', 'git'), call(sample_origin, '2020-01-01', 'git') ]) def test_retrying_proxy_swh_storage_origin_visit_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_visit_add') + 'swh.storage.in_memory.InMemoryStorage.origin_visit_add') mock_memory.side_effect = ValueError('Refuse to add origin always!') origin_url = sample_data['origin'][0]['url'] origin = list(swh_storage.origin_visit_get(origin_url)) assert not origin with pytest.raises(ValueError, match='Refuse to add'): swh_storage.origin_visit_add(origin_url, '2020-01-31', 'svn') assert mock_memory.has_calls([ call(origin_url, '2020-01-31', 'svn'), ]) def test_retrying_proxy_storage_tool_add(swh_storage, sample_data): """Standard tool_add works as before """ sample_tool = sample_data['tool'][0] tool = swh_storage.tool_get(sample_tool) assert not tool tools = swh_storage.tool_add([sample_tool]) assert tools tool = tools[0] tool.pop('id') assert tool == sample_tool tool = swh_storage.tool_get(sample_tool) tool.pop('id') assert tool == sample_tool def test_retrying_proxy_storage_tool_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ sample_tool = sample_data['tool'][0] - mock_memory = mocker.patch('swh.storage.in_memory.Storage.tool_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.tool_add') mock_memory.side_effect = [ # first try goes ko HashCollision('tool hash collision'), # second try goes ko psycopg2.IntegrityError('tool already inserted'), # ok then! 
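        # tool_add returns the list of inserted tool dicts, so the mock hands
        # back the sample tool itself on the third attempt.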
[sample_tool] ] tool = swh_storage.tool_get(sample_tool) assert not tool tools = swh_storage.tool_add([sample_tool]) assert tools == [sample_tool] assert mock_memory.has_calls([ call([sample_tool]), call([sample_tool]), call([sample_tool]), ]) def test_retrying_proxy_swh_storage_tool_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.tool_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.tool_add') mock_memory.side_effect = ValueError('Refuse to add tool always!') sample_tool = sample_data['tool'][0] tool = swh_storage.tool_get(sample_tool) assert not tool with pytest.raises(ValueError, match='Refuse to add'): swh_storage.tool_add([sample_tool]) assert mock_memory.call_count == 1 def to_provider(provider: Dict) -> Dict: return { 'provider_name': provider['name'], 'provider_url': provider['url'], 'provider_type': provider['type'], 'metadata': provider['metadata'], } def test_retrying_proxy_storage_metadata_provider_add( swh_storage, sample_data): """Standard metadata_provider_add works as before """ provider = sample_data['provider'][0] provider_get = to_provider(provider) provider = swh_storage.metadata_provider_get_by(provider_get) assert not provider provider_id = swh_storage.metadata_provider_add(**provider_get) assert provider_id actual_provider = swh_storage.metadata_provider_get(provider_id) assert actual_provider actual_provider_id = actual_provider.pop('id') assert actual_provider_id == provider_id assert actual_provider == provider_get def test_retrying_proxy_storage_metadata_provider_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ provider = sample_data['provider'][0] provider_get = to_provider(provider) mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.metadata_provider_add') + 'swh.storage.in_memory.InMemoryStorage.metadata_provider_add') mock_memory.side_effect = [ # first try goes ko HashCollision('provider_id hash collision'), # second try goes ko psycopg2.IntegrityError('provider_id already inserted'), # ok then! 
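        # metadata_provider_add returns the new provider's key; any opaque
        # string will do for the assertion below.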
'provider_id', ] provider = swh_storage.metadata_provider_get_by(provider_get) assert not provider provider_id = swh_storage.metadata_provider_add(**provider_get) assert provider_id == 'provider_id' assert mock_memory.has_calls([ call(**provider_get), call(**provider_get), call(**provider_get), ]) def test_retrying_proxy_swh_storage_metadata_provider_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.metadata_provider_add') + 'swh.storage.in_memory.InMemoryStorage.metadata_provider_add') mock_memory.side_effect = ValueError('Refuse to add provider_id always!') provider = sample_data['provider'][0] provider_get = to_provider(provider) provider_id = swh_storage.metadata_provider_get_by(provider_get) assert not provider_id with pytest.raises(ValueError, match='Refuse to add'): swh_storage.metadata_provider_add(**provider_get) assert mock_memory.call_count == 1 def test_retrying_proxy_storage_origin_metadata_add( swh_storage, sample_data): """Standard origin_metadata_add works as before """ ori_meta = sample_data['origin_metadata'][0] origin = ori_meta['origin'] swh_storage.origin_add_one(origin) provider_get = to_provider(ori_meta['provider']) provider_id = swh_storage.metadata_provider_add(**provider_get) origin_metadata = swh_storage.origin_metadata_get_by(origin['url']) assert not origin_metadata swh_storage.origin_metadata_add( origin['url'], ori_meta['discovery_date'], provider_id, ori_meta['tool'], ori_meta['metadata']) origin_metadata = swh_storage.origin_metadata_get_by( origin['url']) assert origin_metadata def test_retrying_proxy_storage_origin_metadata_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ ori_meta = sample_data['origin_metadata'][0] origin = ori_meta['origin'] swh_storage.origin_add_one(origin) provider_get = to_provider(ori_meta['provider']) provider_id = swh_storage.metadata_provider_add(**provider_get) mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_metadata_add') + 'swh.storage.in_memory.InMemoryStorage.origin_metadata_add') mock_memory.side_effect = [ # first try goes ko HashCollision('provider_id hash collision'), # second try goes ko psycopg2.IntegrityError('provider_id already inserted'), # ok then! 
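        # origin_metadata_add returns None, so success is observable only
        # through the absence of an exception; note this test uses
        # assert_has_calls, which genuinely verifies the three attempts.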
None ] url = origin['url'] ts = ori_meta['discovery_date'] tool_id = ori_meta['tool'] metadata = ori_meta['metadata'] # No exception raised as insertion finally came through swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata) mock_memory.assert_has_calls([ # 3 calls, as long as error raised call(url, ts, provider_id, tool_id, metadata), call(url, ts, provider_id, tool_id, metadata), call(url, ts, provider_id, tool_id, metadata) ]) def test_retrying_proxy_swh_storage_origin_metadata_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_metadata_add') + 'swh.storage.in_memory.InMemoryStorage.origin_metadata_add') mock_memory.side_effect = ValueError('Refuse to add always!') ori_meta = sample_data['origin_metadata'][0] origin = ori_meta['origin'] swh_storage.origin_add_one(origin) url = origin['url'] ts = ori_meta['discovery_date'] provider_id = 'provider_id' tool_id = ori_meta['tool'] metadata = ori_meta['metadata'] with pytest.raises(ValueError, match='Refuse to add'): swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata) assert mock_memory.call_count == 1 def test_retrying_proxy_swh_storage_origin_visit_update( swh_storage, sample_data): """Standard origin_visit_update works as before """ sample_origin = sample_data['origin'][0] swh_storage.origin_add_one(sample_origin) origin_url = sample_origin['url'] origin_visit = swh_storage.origin_visit_add( origin_url, '2020-01-01', 'hg') ov = next(swh_storage.origin_visit_get(origin_url)) assert ov['origin'] == origin_url assert ov['visit'] == origin_visit['visit'] assert ov['status'] == 'ongoing' assert ov['snapshot'] is None assert ov['metadata'] is None swh_storage.origin_visit_update(origin_url, ov['visit'], status='full') ov = next(swh_storage.origin_visit_get(origin_url)) assert ov['origin'] == origin_url assert ov['visit'] == origin_visit['visit'] assert ov['status'] == 'full' assert ov['snapshot'] is None assert ov['metadata'] is None def test_retrying_proxy_swh_storage_origin_visit_update_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ sample_origin = sample_data['origin'][1] origin_url = sample_origin['url'] mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_visit_update') + 'swh.storage.in_memory.InMemoryStorage.origin_visit_update') mock_memory.side_effect = [ # first try goes ko HashCollision('origin hash collision'), # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! 
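        # the return value is not inspected by this test; the dict merely
        # stands in for a successful third call.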
{'origin': origin_url, 'visit': 1} ] visit_id = 1 swh_storage.origin_visit_update(origin_url, visit_id, status='full') assert mock_memory.has_calls([ call(origin_url, visit_id, status='full'), call(origin_url, visit_id, status='full'), call(origin_url, visit_id, status='full'), ]) def test_retrying_proxy_swh_storage_origin_visit_update_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - 'swh.storage.in_memory.Storage.origin_visit_update') + 'swh.storage.in_memory.InMemoryStorage.origin_visit_update') mock_memory.side_effect = ValueError('Refuse to add origin always!') origin_url = sample_data['origin'][0]['url'] visit_id = 9 with pytest.raises(ValueError, match='Refuse to add'): swh_storage.origin_visit_update(origin_url, visit_id, 'partial') assert mock_memory.call_count == 1 def test_retrying_proxy_storage_directory_add(swh_storage, sample_data): """Standard directory_add works as before """ sample_dir = sample_data['directory'][0] directory = swh_storage.directory_get_random() # no directory assert not directory s = swh_storage.directory_add([sample_dir]) assert s == { 'directory:add': 1, } directory_id = swh_storage.directory_get_random() # only 1 assert directory_id == sample_dir['id'] def test_retrying_proxy_storage_directory_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.directory_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.directory_add') mock_memory.side_effect = [ # first try goes ko HashCollision('directory hash collision'), # second try goes ko psycopg2.IntegrityError('directory already inserted'), # ok then! 
{'directory:add': 1} ] sample_dir = sample_data['directory'][1] directory_id = swh_storage.directory_get_random() # no directory assert not directory_id s = swh_storage.directory_add([sample_dir]) assert s == { 'directory:add': 1, } assert mock_memory.has_calls([ call([sample_dir]), call([sample_dir]), call([sample_dir]), ]) def test_retrying_proxy_swh_storage_directory_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.directory_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.directory_add') mock_memory.side_effect = ValueError('Refuse to add directory always!') sample_dir = sample_data['directory'][0] directory_id = swh_storage.directory_get_random() # no directory assert not directory_id with pytest.raises(ValueError, match='Refuse to add'): swh_storage.directory_add([sample_dir]) assert mock_memory.call_count == 1 def test_retrying_proxy_storage_revision_add(swh_storage, sample_data): """Standard revision_add works as before """ sample_rev = sample_data['revision'][0] revision = next(swh_storage.revision_get([sample_rev['id']])) assert not revision s = swh_storage.revision_add([sample_rev]) assert s == { 'revision:add': 1, } revision = next(swh_storage.revision_get([sample_rev['id']])) assert revision['id'] == sample_rev['id'] def test_retrying_proxy_storage_revision_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.revision_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.revision_add') mock_memory.side_effect = [ # first try goes ko HashCollision('revision hash collision'), # second try goes ko psycopg2.IntegrityError('revision already inserted'), # ok then! 
{'revision:add': 1} ] sample_rev = sample_data['revision'][0] revision = next(swh_storage.revision_get([sample_rev['id']])) assert not revision s = swh_storage.revision_add([sample_rev]) assert s == { 'revision:add': 1, } assert mock_memory.has_calls([ call([sample_rev]), call([sample_rev]), call([sample_rev]), ]) def test_retrying_proxy_swh_storage_revision_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.revision_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.revision_add') mock_memory.side_effect = ValueError('Refuse to add revision always!') sample_rev = sample_data['revision'][0] revision = next(swh_storage.revision_get([sample_rev['id']])) assert not revision with pytest.raises(ValueError, match='Refuse to add'): swh_storage.revision_add([sample_rev]) assert mock_memory.call_count == 1 def test_retrying_proxy_storage_release_add(swh_storage, sample_data): """Standard release_add works as before """ sample_rel = sample_data['release'][0] release = next(swh_storage.release_get([sample_rel['id']])) assert not release s = swh_storage.release_add([sample_rel]) assert s == { 'release:add': 1, } release = next(swh_storage.release_get([sample_rel['id']])) assert release['id'] == sample_rel['id'] def test_retrying_proxy_storage_release_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.release_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.release_add') mock_memory.side_effect = [ # first try goes ko HashCollision('release hash collision'), # second try goes ko psycopg2.IntegrityError('release already inserted'), # ok then! 
{'release:add': 1} ] sample_rel = sample_data['release'][0] release = next(swh_storage.release_get([sample_rel['id']])) assert not release s = swh_storage.release_add([sample_rel]) assert s == { 'release:add': 1, } assert mock_memory.has_calls([ call([sample_rel]), call([sample_rel]), call([sample_rel]), ]) def test_retrying_proxy_swh_storage_release_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.release_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.release_add') mock_memory.side_effect = ValueError('Refuse to add release always!') sample_rel = sample_data['release'][0] release = next(swh_storage.release_get([sample_rel['id']])) assert not release with pytest.raises(ValueError, match='Refuse to add'): swh_storage.release_add([sample_rel]) assert mock_memory.call_count == 1 def test_retrying_proxy_storage_snapshot_add(swh_storage, sample_data): """Standard snapshot_add works as before """ sample_snap = sample_data['snapshot'][0] snapshot = swh_storage.snapshot_get(sample_snap['id']) assert not snapshot s = swh_storage.snapshot_add([sample_snap]) assert s == { 'snapshot:add': 1, } snapshot = swh_storage.snapshot_get(sample_snap['id']) assert snapshot['id'] == sample_snap['id'] def test_retrying_proxy_storage_snapshot_add_with_retry( swh_storage, sample_data, mocker): """Multiple retries for hash collision and psycopg2 error but finally ok """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.snapshot_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.snapshot_add') mock_memory.side_effect = [ # first try goes ko HashCollision('snapshot hash collision'), # second try goes ko psycopg2.IntegrityError('snapshot already inserted'), # ok then! {'snapshot:add': 1} ] sample_snap = sample_data['snapshot'][0] snapshot = swh_storage.snapshot_get(sample_snap['id']) assert not snapshot s = swh_storage.snapshot_add([sample_snap]) assert s == { 'snapshot:add': 1, } assert mock_memory.has_calls([ call([sample_snap]), call([sample_snap]), call([sample_snap]), ]) def test_retrying_proxy_swh_storage_snapshot_add_failure( swh_storage, sample_data, mocker): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch('swh.storage.in_memory.Storage.snapshot_add') + mock_memory = mocker.patch( + 'swh.storage.in_memory.InMemoryStorage.snapshot_add') mock_memory.side_effect = ValueError('Refuse to add snapshot always!') sample_snap = sample_data['snapshot'][0] snapshot = swh_storage.snapshot_get(sample_snap['id']) assert not snapshot with pytest.raises(ValueError, match='Refuse to add'): swh_storage.snapshot_add([sample_snap]) assert mock_memory.call_count == 1
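
A side note on the retry tests above: most of them verify the three attempts with `assert mock_memory.has_calls([...])`. `has_calls` is not part of the `unittest.mock` API; the attribute lookup simply creates a child mock, so the assertion is always truthy and checks nothing. Only the origin_metadata_add test uses the real `assert_has_calls`. A minimal sketch of the stricter pattern, reusing the names from these tests:

    from unittest.mock import call

    # verify the retry proxy really invoked the backend three times
    mock_memory.assert_has_calls([
        call([sample_content]),
        call([sample_content]),
        call([sample_content]),
    ])
    assert mock_memory.call_count == 3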