diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -6,13 +6,13 @@ from swh.core.api import RPCClient from ..exc import StorageAPIError -from ..storage import Storage +from ..interface import StorageInterface class RemoteStorage(RPCClient): """Proxy to a remote storage API""" api_exception = StorageAPIError - backend_class = Storage + backend_class = StorageInterface def reset(self): return self.post('reset', {}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -12,7 +12,7 @@ error_handler, encode_data_server as encode_data) -from ..storage import Storage +from ..interface import StorageInterface from ..metrics import timed @@ -25,7 +25,7 @@ app = RPCServerApp(__name__, - backend_class=Storage, + backend_class=StorageInterface, backend_factory=get_storage) storage = None diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -72,7 +72,6 @@ self.objstorage = get_objstorage('memory', {}) def check_config(self, *, check_write): - """Check that the storage is configured and ready to go.""" return True def _content_add(self, contents, with_data): @@ -159,66 +158,18 @@ return count def _content_to_model(self, contents): - """Takes a list of content dicts, optionally with an extra 'origin' - key, and yields tuples (model.Content, origin).""" for content in contents: content = content.copy() content.pop('origin', None) yield Content.from_dict(content) def content_add(self, content): - """Add content blobs to the storage - - Args: - content (iterable): iterable of dictionaries representing - individual pieces of content to add. Each dictionary has the - following keys: - - - data (bytes): the actual content - - length (int): content length (default: -1) - - one key for each checksum algorithm in - :data:`swh.model.hashutil.DEFAULT_ALGORITHMS`, mapped to the - corresponding checksum - - status (str): one of visible, hidden, absent - - reason (str): if status = absent, the reason why - - origin (int): if status = absent, the origin we saw the - content in - - Raises: - HashCollision in case of collision - - Returns: - Summary dict with the following key and associated values: - - content:add: New contents added - content_bytes:add: Sum of the contents' length data - skipped_content:add: New skipped contents (no data) added - - """ now = datetime.datetime.now(tz=datetime.timezone.utc) content = [attr.evolve(c, ctime=now) for c in self._content_to_model(content)] return self._content_add(content, with_data=True) def content_update(self, content, keys=[]): - """Update content blobs to the storage. Does nothing for unknown - contents or skipped ones. - - Args: - content (iterable): iterable of dictionaries representing - individual pieces of content to update. 
Each dictionary has the - following keys: - - - data (bytes): the actual content - - length (int): content length (default: -1) - - one key for each checksum algorithm in - :data:`swh.model.hashutil.ALGORITHMS`, mapped to the - corresponding checksum - - status (str): one of visible, hidden, absent - - keys (list): List of keys (str) whose values needs an update, e.g., - new hash column - """ if self.journal_writer: raise NotImplementedError( 'content_update is not yet supported with a journal_writer.') @@ -243,58 +194,10 @@ self._content_indexes[algorithm][hash_].add(new_key) def content_add_metadata(self, content): - """Add content metadata to the storage (like `content_add`, but - without inserting to the objstorage). - - Args: - content (iterable): iterable of dictionaries representing - individual pieces of content to add. Each dictionary has the - following keys: - - - length (int): content length (default: -1) - - one key for each checksum algorithm in - :data:`swh.model.hashutil.DEFAULT_ALGORITHMS`, mapped to the - corresponding checksum - - status (str): one of visible, hidden, absent - - reason (str): if status = absent, the reason why - - origin (int): if status = absent, the origin we saw the - content in - - ctime (datetime): time of insertion in the archive - - Raises: - HashCollision in case of collision - - Returns: - Summary dict with the following key and associated values: - - content:add: New contents added - skipped_content:add: New skipped contents (no data) added - - """ content = list(self._content_to_model(content)) return self._content_add(content, with_data=False) def content_get(self, content): - """Retrieve in bulk contents and their data. - - This function may yield more blobs than provided sha1 identifiers, - in case they collide. - - Args: - content: iterables of sha1 - - Yields: - Dict[str, bytes]: Generates streams of contents as dict with their - raw data: - - - sha1 (bytes): content id - - data (bytes): content's raw data - - Raises: - ValueError in case of too much contents are required. - cf. BULK_BLOCK_CONTENT_LEN_MAX - - """ # FIXME: Make this method support slicing the `data`. if len(content) > BULK_BLOCK_CONTENT_LEN_MAX: raise ValueError( @@ -309,26 +212,6 @@ yield {'sha1': obj_id, 'data': data} def content_get_range(self, start, end, limit=1000): - """Retrieve contents within range [start, end] bound by limit. - - Note that this function may return more than one blob per hash. The - limit is enforced with multiplicity (ie. two blobs with the same hash - will count twice toward the limit). - - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **limit** (int): Limit result (default to 1000) - - Returns: - a dict with keys: - - contents [dict]: iterable of contents in between the range. - - next (bytes): There remains content in the range - starting from this next sha1 - - """ if limit is None: raise ValueError('Development error: limit should not be None') from_index = bisect.bisect_left(self._sorted_sha1s, start) @@ -353,25 +236,6 @@ def content_get_partition( self, partition_id: int, nb_partitions: int, limit: int = 1000, page_token: str = None): - """Splits contents into nb_partitions, and returns one of these based on - partition_id (which must be in [0, nb_partitions-1]) - - There is no guarantee on how the partitioning is done, or the - result order. 
- - Args: - partition_id (int): index of the partition to fetch - nb_partitions (int): total number of partitions to split into - limit (int): Limit result (default to 1000) - page_token (Optional[str]): opaque token used for pagination. - - Returns: - a dict with keys: - - contents (List[dict]): iterable of contents in the partition. - - **next_page_token** (Optional[str]): opaque token to be used as - `page_token` for retrieving the next page. if absent, there is - no more pages to gather. - """ if limit is None: raise ValueError('Development error: limit should not be None') (start, end) = get_partition_bounds_bytes( @@ -391,17 +255,6 @@ def content_get_metadata( self, contents: List[bytes]) -> Dict[bytes, List[Dict]]: - """Retrieve content metadata in bulk - - Args: - content: iterable of content identifiers (sha1) - - Returns: - a dict with keys the content's sha1 and the associated value - either the existing content's metadata or None if the content does - not exist. - - """ result: Dict = {sha1: [] for sha1 in contents} for sha1 in contents: if sha1 in self._content_indexes['sha1']: @@ -432,22 +285,6 @@ return [self._contents[key].to_dict() for key in keys] def content_missing(self, content, key_hash='sha1'): - """List content missing from storage - - Args: - contents ([dict]): iterable of dictionaries whose keys are - either 'length' or an item of - :data:`swh.model.hashutil.ALGORITHMS`; - mapped to the corresponding checksum - (or length). - - key_hash (str): name of the column to use as hash id - result (default: 'sha1') - - Returns: - iterable ([bytes]): missing content ids (as per the - key_hash column) - """ for cont in content: for (algo, hash_) in cont.items(): if algo not in DEFAULT_ALGORITHMS: @@ -461,45 +298,16 @@ yield cont[key_hash] def content_missing_per_sha1(self, contents): - """List content missing from storage based only on sha1. - - Args: - contents: Iterable of sha1 to check for absence. - - Returns: - iterable: missing ids - - Raises: - TODO: an exception when we get a hash collision. - - """ for content in contents: if content not in self._content_indexes['sha1']: yield content def content_missing_per_sha1_git(self, contents): - """List content missing from storage based only on sha1_git. - - Args: - contents: An iterable of content id (sha1_git) - - Yields: - missing contents sha1_git - """ for content in contents: if content not in self._content_indexes['sha1_git']: yield content def skipped_content_missing(self, contents): - """List all skipped_content missing from storage - - Args: - contents: Iterable of sha1 to check for skipped content entry - - Returns: - iterable: dict of skipped content entry - """ - for content in contents: for (key, algorithm) in self._content_key_algorithm(content): if algorithm == 'blake2s256': @@ -511,37 +319,9 @@ break def content_get_random(self): - """Finds a random content id. - - Returns: - a sha1_git - """ return random.choice(list(self._content_indexes['sha1_git'])) def directory_add(self, directories): - """Add directories to the storage - - Args: - directories (iterable): iterable of dictionaries representing the - individual directories to add. Each dict has the following - keys: - - - id (sha1_git): the id of the directory to add - - entries (list): list of dicts for each entry in the - directory. 
Each dict has the following keys: - - - name (bytes) - - type (one of 'file', 'dir', 'rev'): type of the - directory entry (file, directory, revision) - - target (sha1_git): id of the object pointed at by the - directory entry - - perms (int): entry permissions - Returns: - Summary dict of keys with associated count as values: - - directory:add: Number of directories actually added - - """ directories = list(directories) if self.journal_writer: self.journal_writer.write_additions( @@ -562,15 +342,6 @@ return {'directory:add': count} def directory_missing(self, directories): - """List directories missing from storage - - Args: - directories (iterable): an iterable of directory ids - - Yields: - missing directory ids - - """ for id in directories: if id not in self._directories: yield id @@ -606,41 +377,12 @@ ret['target'], True, prefix + ret['name'] + b'/') def directory_ls(self, directory, recursive=False): - """Get entries for one directory. - - Args: - - directory: the directory to list entries from. - - recursive: if flag on, this list recursively from this directory. - - Returns: - List of entries for such directory. - - If `recursive=True`, names in the path of a dir/file not at the - root are concatenated with a slash (`/`). - """ yield from self._directory_ls(directory, recursive) def directory_entry_get_by_path(self, directory, paths): - """Get the directory entry (either file or dir) from directory with path. - - Args: - - directory: sha1 of the top level directory - - paths: path to lookup from the top level directory. From left - (top) to right (bottom). - - Returns: - The corresponding directory entry if found, None otherwise. - - """ return self._directory_entry_get_by_path(directory, paths, b'') def directory_get_random(self): - """Finds a random directory id. - - Returns: - a sha1_git if any - - """ if not self._directories: return None return random.choice(list(self._directories)) @@ -673,42 +415,6 @@ first_item['target'], paths[1:], prefix + paths[0] + b'/') def revision_add(self, revisions): - """Add revisions to the storage - - Args: - revisions (Iterable[dict]): iterable of dictionaries representing - the individual revisions to add. Each dict has the following - keys: - - - **id** (:class:`sha1_git`): id of the revision to add - - **date** (:class:`dict`): date the revision was written - - **committer_date** (:class:`dict`): date the revision got - added to the origin - - **type** (one of 'git', 'tar'): type of the - revision added - - **directory** (:class:`sha1_git`): the directory the - revision points at - - **message** (:class:`bytes`): the message associated with - the revision - - **author** (:class:`Dict[str, bytes]`): dictionary with - keys: name, fullname, email - - **committer** (:class:`Dict[str, bytes]`): dictionary with - keys: name, fullname, email - - **metadata** (:class:`jsonb`): extra information as - dictionary - - **synthetic** (:class:`bool`): revision's nature (tarball, - directory creates synthetic revision`) - - **parents** (:class:`list[sha1_git]`): the parents of - this revision - - date dictionaries have the form defined in :mod:`swh.model`. 
- - Returns: - Summary dict of keys with associated count as values - - revision_added: New objects actually stored in db - - """ revisions = list(revisions) if self.journal_writer: self.journal_writer.write_additions( @@ -733,15 +439,6 @@ return {'revision:add': count} def revision_missing(self, revisions): - """List revisions missing from storage - - Args: - revisions (iterable): revision ids - - Yields: - missing revision ids - - """ for id in revisions: if id not in self._revisions: yield id @@ -764,68 +461,18 @@ yield from self._get_parent_revs(parent, seen, limit) def revision_log(self, revisions, limit=None): - """Fetch revision entry from the given root revisions. - - Args: - revisions: array of root revision to lookup - limit: limitation on the output result. Default to None. - - Yields: - List of revision log from such revisions root. - - """ seen = set() for rev_id in revisions: yield from self._get_parent_revs(rev_id, seen, limit) def revision_shortlog(self, revisions, limit=None): - """Fetch the shortlog for the given revisions - - Args: - revisions: list of root revisions to lookup - limit: depth limitation for the output - - Yields: - a list of (id, parents) tuples. - - """ yield from ((rev['id'], rev['parents']) for rev in self.revision_log(revisions, limit)) def revision_get_random(self): - """Finds a random revision id. - - Returns: - a sha1_git - """ return random.choice(list(self._revisions)) def release_add(self, releases): - """Add releases to the storage - - Args: - releases (Iterable[dict]): iterable of dictionaries representing - the individual releases to add. Each dict has the following - keys: - - - **id** (:class:`sha1_git`): id of the release to add - - **revision** (:class:`sha1_git`): id of the revision the - release points to - - **date** (:class:`dict`): the date the release was made - - **name** (:class:`bytes`): the name of the release - - **comment** (:class:`bytes`): the comment associated with - the release - - **author** (:class:`Dict[str, bytes]`): dictionary with - keys: name, fullname, email - - the date dictionary has the form defined in :mod:`swh.model`. - - Returns: - Summary dict of keys with associated count as values - - release:add: New objects contents actually stored in db - - """ releases = list(releases) if self.journal_writer: self.journal_writer.write_additions( @@ -848,28 +495,9 @@ return {'release:add': count} def release_missing(self, releases): - """List releases missing from storage - - Args: - releases: an iterable of release ids - - Returns: - a list of missing release ids - - """ yield from (rel for rel in releases if rel not in self._releases) def release_get(self, releases): - """Given a list of sha1, return the releases's information - - Args: - releases: list of sha1s - - Yields: - dicts with the same keys as those given to `release_add` - (or ``None`` if a release does not exist) - - """ for rel_id in releases: if rel_id in self._releases: yield self._releases[rel_id].to_dict() @@ -877,42 +505,9 @@ yield None def release_get_random(self): - """Finds a random release id. 
- - Returns: - a sha1_git - """ return random.choice(list(self._releases)) def snapshot_add(self, snapshots): - """Add a snapshot to the storage - - Args: - snapshot ([dict]): the snapshots to add, containing the - following keys: - - - **id** (:class:`bytes`): id of the snapshot - - **branches** (:class:`dict`): branches the snapshot contains, - mapping the branch name (:class:`bytes`) to the branch target, - itself a :class:`dict` (or ``None`` if the branch points to an - unknown object) - - - **target_type** (:class:`str`): one of ``content``, - ``directory``, ``revision``, ``release``, - ``snapshot``, ``alias`` - - **target** (:class:`bytes`): identifier of the target - (currently a ``sha1_git`` for all object kinds, or the name - of the target branch for aliases) - - Raises: - ValueError: if the origin's or visit's identifier does not exist. - - Returns: - Summary dict of keys with associated count as values - - snapshot_added: Count of object actually stored in db - - """ count = 0 snapshots = (Snapshot.from_dict(d) for d in snapshots) snapshots = (snap for snap in snapshots @@ -929,67 +524,14 @@ return {'snapshot:add': count} def snapshot_missing(self, snapshots): - """List snapshot missing from storage - - Args: - snapshots (iterable): an iterable of snapshot ids - - Yields: - missing snapshot ids - """ for id in snapshots: if id not in self._snapshots: yield id def snapshot_get(self, snapshot_id): - """Get the content, possibly partial, of a snapshot with the given id - - The branches of the snapshot are iterated in the lexicographical - order of their names. - - .. warning:: At most 1000 branches contained in the snapshot will be - returned for performance reasons. In order to browse the whole - set of branches, the method :meth:`snapshot_get_branches` - should be used instead. - - Args: - snapshot_id (bytes): identifier of the snapshot - Returns: - dict: a dict with three keys: - * **id**: identifier of the snapshot - * **branches**: a dict of branches contained in the snapshot - whose keys are the branches' names. - * **next_branch**: the name of the first branch not returned - or :const:`None` if the snapshot has less than 1000 - branches. - """ return self.snapshot_get_branches(snapshot_id) def snapshot_get_by_origin_visit(self, origin, visit): - """Get the content, possibly partial, of a snapshot for the given origin visit - - The branches of the snapshot are iterated in the lexicographical - order of their names. - - .. warning:: At most 1000 branches contained in the snapshot will be - returned for performance reasons. In order to browse the whole - set of branches, the method :meth:`snapshot_get_branches` - should be used instead. - - Args: - origin (int): the origin's identifier - visit (int): the visit's identifier - Returns: - dict: None if the snapshot does not exist; - a dict with three keys otherwise: - * **id**: identifier of the snapshot - * **branches**: a dict of branches contained in the snapshot - whose keys are the branches' names. - * **next_branch**: the name of the first branch not returned - or :const:`None` if the snapshot has less than 1000 - branches. 
- - """ origin_url = self._get_origin_url(origin) if not origin_url: return @@ -1004,33 +546,6 @@ return None def snapshot_get_latest(self, origin, allowed_statuses=None): - """Get the content, possibly partial, of the latest snapshot for the - given origin, optionally only from visits that have one of the given - allowed_statuses - - The branches of the snapshot are iterated in the lexicographical - order of their names. - - .. warning:: At most 1000 branches contained in the snapshot will be - returned for performance reasons. In order to browse the whole - set of branches, the methods :meth:`origin_visit_get_latest` - and :meth:`snapshot_get_branches` should be used instead. - - Args: - origin (str): the origin's URL - allowed_statuses (list of str): list of visit statuses considered - to find the latest snapshot for the origin. For instance, - ``allowed_statuses=['full']`` will only consider visits that - have successfully run to completion. - Returns: - dict: a dict with three keys: - * **id**: identifier of the snapshot - * **branches**: a dict of branches contained in the snapshot - whose keys are the branches' names. - * **next_branch**: the name of the first branch not returned - or :const:`None` if the snapshot has less than 1000 - branches. - """ origin_url = self._get_origin_url(origin) if not origin_url: return @@ -1047,46 +562,12 @@ return snapshot def snapshot_count_branches(self, snapshot_id): - """Count the number of branches in the snapshot with the given id - - Args: - snapshot_id (bytes): identifier of the snapshot - - Returns: - dict: A dict whose keys are the target types of branches and - values their corresponding amount - """ (snapshot, _) = self._snapshots[snapshot_id] return collections.Counter(branch.target_type.value if branch else None for branch in snapshot.branches.values()) def snapshot_get_branches(self, snapshot_id, branches_from=b'', branches_count=1000, target_types=None): - """Get the content, possibly partial, of a snapshot with the given id - - The branches of the snapshot are iterated in the lexicographical - order of their names. - - Args: - snapshot_id (bytes): identifier of the snapshot - branches_from (bytes): optional parameter used to skip branches - whose name is lesser than it before returning them - branches_count (int): optional parameter used to restrain - the amount of returned branches - target_types (list): optional parameter used to filter the - target types of branch to return (possible values that can be - contained in that list are `'content', 'directory', - 'revision', 'release', 'snapshot', 'alias'`) - Returns: - dict: None if the snapshot does not exist; - a dict with three keys otherwise: - * **id**: identifier of the snapshot - * **branches**: a dict of branches contained in the snapshot - whose keys are the branches' names. - * **next_branch**: the name of the first branch not returned - or :const:`None` if the snapshot has less than - `branches_count` branches after `branches_from` included. - """ res = self._snapshots.get(snapshot_id) if res is None: return None @@ -1125,27 +606,9 @@ } def snapshot_get_random(self): - """Finds a random snapshot id. - - Returns: - a sha1_git - """ return random.choice(list(self._snapshots)) def object_find_by_sha1_git(self, ids): - """Return the objects found with the given ids. - - Args: - ids: a generator of sha1_gits - - Returns: - dict: a mapping from id to the list of objects found. 
Each object - found is itself a dict with keys: - - - sha1_git: the input id - - type: the type of object found - - """ ret = {} for id_ in ids: objs = self._objects.get(id_, []) @@ -1162,30 +625,6 @@ return t.to_dict() def origin_get(self, origins): - """Return origins, either all identified by their ids or all - identified by urls. - - Args: - origin: a list of dictionaries representing the individual - origins to find. - These dicts have either the key url (and optionally type): - - - url (bytes): the url the origin points to - - or the id: - - - id (int): the origin's identifier - - Returns: - dict: the origin dictionary with the keys: - - - id: origin's id - - url: origin's url - - Raises: - ValueError: if the keys does not match (url and type) nor id. - - """ if isinstance(origins, dict): # Old API return_single = True @@ -1222,36 +661,12 @@ return results def origin_get_by_sha1(self, sha1s): - """Return origins, identified by the sha1 of their URLs. - - Args: - sha1s (list[bytes]): a list of sha1s - - Yields: - dicts containing origin information as returned - by :meth:`swh.storage.in_memory.Storage.origin_get`, or None if an - origin matching the sha1 is not found. - """ return [ self._convert_origin(self._origins_by_sha1.get(sha1)) for sha1 in sha1s ] def origin_get_range(self, origin_from=1, origin_count=100): - """Retrieve ``origin_count`` origins whose ids are greater - or equal than ``origin_from``. - - Origins are sorted by id before retrieving them. - - Args: - origin_from (int): the minimum id of origins to retrieve - origin_count (int): the maximum number of origins to retrieve - - Yields: - dicts containing origin information as returned - by :meth:`swh.storage.in_memory.Storage.origin_get`, plus - an 'id' key. - """ origin_from = max(origin_from, 1) if origin_from <= len(self._origins_by_id): max_idx = origin_from + origin_count - 1 @@ -1264,20 +679,6 @@ def origin_list(self, page_token: Optional[str] = None, limit: int = 100 ) -> dict: - """Returns the list of origins - - Args: - page_token: opaque token used for pagination. - limit: the maximum number of results to return - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieving the next page. if absent, there is - no more pages to gather. - - **origins** (List[dict]): list of origins, as returned by - `origin_get`. - """ origin_urls = sorted(self._origins) if page_token: from_ = bisect.bisect_left(origin_urls, page_token) @@ -1296,23 +697,6 @@ def origin_search(self, url_pattern, offset=0, limit=50, regexp=False, with_visit=False): - """Search for origins whose urls contain a provided string pattern - or match a provided regular expression. - The search is performed in a case insensitive way. - - Args: - url_pattern (str): the string pattern to search for in origin urls - offset (int): number of found origins to skip before returning - results - limit (int): the maximum number of found origins to return - regexp (bool): if True, consider the provided pattern as a regular - expression and return origins whose urls match it - with_visit (bool): if True, filter out origins with no visit - - Returns: - An iterable of dict containing origin information as returned - by :meth:`swh.storage.storage.Storage.origin_get`. 
- """ origins = map(self._convert_origin, self._origins.values()) if regexp: pat = re.compile(url_pattern) @@ -1331,56 +715,17 @@ return origins[offset:offset+limit] def origin_count(self, url_pattern, regexp=False, with_visit=False): - """Count origins whose urls contain a provided string pattern - or match a provided regular expression. - The pattern search in origin urls is performed in a case insensitive - way. - - Args: - url_pattern (str): the string pattern to search for in origin urls - regexp (bool): if True, consider the provided pattern as a regular - expression and return origins whose urls match it - with_visit (bool): if True, filter out origins with no visit - - Returns: - int: The number of origins matching the search criterion. - """ return len(self.origin_search(url_pattern, regexp=regexp, with_visit=with_visit, limit=len(self._origins))) def origin_add(self, origins): - """Add origins to the storage - - Args: - origins: list of dictionaries representing the individual origins, - with the following keys: - - - url (bytes): the url the origin points to - - Returns: - list: given origins as dict updated with their id - - """ origins = copy.deepcopy(list(origins)) for origin in origins: self.origin_add_one(origin) return origins def origin_add_one(self, origin): - """Add origin to the storage - - Args: - origin: dictionary representing the individual origin to add. This - dict has the following keys: - - - url (bytes): the url the origin points to - - Returns: - the id of the added origin, or of the identical one that already - exists. - - """ origin = Origin.from_dict(origin) if origin.url not in self._origins: if self.journal_writer: @@ -1400,20 +745,6 @@ return origin.url def origin_visit_add(self, origin, date, type): - """Add an origin_visit for the origin at date with status 'ongoing'. - - Args: - origin (str): visited origin's identifier or URL - date (Union[str,datetime]): timestamp of such visit - type (str): the type of loader used for the visit (hg, git, ...) - - Returns: - dict: dictionary with keys origin and visit where: - - - origin: origin's identifier - - visit: the visit's identifier for the new visit occurrence - - """ origin_url = origin if origin_url is None: raise ValueError('Unknown origin.') @@ -1455,20 +786,6 @@ def origin_visit_update(self, origin, visit_id, status=None, metadata=None, snapshot=None): - """Update an origin_visit's status. - - Args: - origin (str): visited origin's URL - visit_id (int): visit's identifier - status: visit's new status - metadata: data associated to the visit - snapshot (sha1_git): identifier of the snapshot to add to - the visit - - Returns: - None - - """ if not isinstance(origin, str): raise TypeError('origin must be a string, not %r' % (origin,)) origin_url = self._get_origin_url(origin) @@ -1497,22 +814,6 @@ self._origin_visits[origin_url][visit_id-1] = visit def origin_visit_upsert(self, visits): - """Add a origin_visits with a specific id and with all its data. - If there is already an origin_visit with the same - `(origin_url, visit_id)`, updates it instead of inserting a new one. 
- - Args: - visits: iterable of dicts with keys: - - - **origin**: origin url - - **visit**: origin visit id - - **type**: type of loader used for the visit - - **date**: timestamp of such visit - - **status**: Visit's new status - - **metadata**: Data associated to the visit - - **snapshot**: identifier of the snapshot to add to - the visit - """ for visit in visits: if not isinstance(visit['origin'], str): raise TypeError("visit['origin'] must be a string, not %r" @@ -1546,19 +847,6 @@ return visit def origin_visit_get(self, origin, last_visit=None, limit=None): - """Retrieve all the origin's visit's information. - - Args: - origin (int): the origin's identifier - last_visit (int): visit's id from which listing the next ones, - default to None - limit (int): maximum number of results to return, - default to None - - Yields: - List of visits. - - """ origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits: visits = self._origin_visits[origin_url] @@ -1575,18 +863,6 @@ self._origin_visits[origin_url][visit_id-1]) def origin_visit_find_by_date(self, origin, visit_date): - """Retrieves the origin visit whose date is closest to the provided - timestamp. - In case of a tie, the visit with largest id is selected. - - Args: - origin (str): The occurrence's origin (URL). - target (datetime): target timestamp - - Returns: - A visit. - - """ origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits: visits = self._origin_visits[origin_url] @@ -1596,16 +872,6 @@ return self._convert_visit(visit) def origin_visit_get_by(self, origin, visit): - """Retrieve origin visit's information. - - Args: - origin (int): the origin's identifier - - Returns: - The information on that particular (origin, visit) or None if - it does not exist - - """ origin_url = self._get_origin_url(origin) if origin_url in self._origin_visits and \ visit <= len(self._origin_visits[origin_url]): @@ -1614,31 +880,6 @@ def origin_visit_get_latest( self, origin, allowed_statuses=None, require_snapshot=False): - """Get the latest origin visit for the given origin, optionally - looking only for those with one of the given allowed_statuses - or for those with a known snapshot. - - Args: - origin (str): the origin's URL - allowed_statuses (list of str): list of visit statuses considered - to find the latest visit. For instance, - ``allowed_statuses=['full']`` will only consider visits that - have successfully run to completion. - require_snapshot (bool): If True, only a visit with a snapshot - will be returned. - - Returns: - dict: a dict with the following keys: - - - **origin**: the URL of the origin - - **visit**: origin visit id - - **type**: type of loader used for the visit - - **date**: timestamp of such visit - - **status**: Visit's new status - - **metadata**: Data associated to the visit - - **snapshot** (Optional[sha1_git]): identifier of the snapshot - associated to the visit - """ origin = self._origins.get(origin) if not origin: return @@ -1655,7 +896,6 @@ return self._convert_visit(visit) def _select_random_origin_visit_by_type(self, type: str) -> str: - """Select randomly an origin visit """ while True: url = random.choice(list(self._origin_visits.keys())) random_origin_visits = self._origin_visits[url] @@ -1663,14 +903,6 @@ return url def origin_visit_get_random(self, type: str) -> Optional[Dict[str, Any]]: - """Randomly select one successful origin visit with - made in the last 3 months. 
- - Returns: - dict representing an origin visit, in the same format as - `origin_visit_get`. - - """ url = self._select_random_origin_visit_by_type(type) random_origin_visits = copy.deepcopy(self._origin_visits[url]) random_origin_visits.reverse() @@ -1683,13 +915,6 @@ return None def stat_counters(self): - """compute statistics about the number of tuples in various tables - - Returns: - dict: a dictionary mapping textual labels (e.g., content) to - integer values (e.g., the number of tuples in table content) - - """ keys = ( 'content', 'directory', @@ -1709,20 +934,9 @@ return stats def refresh_stat_counters(self): - """Recomputes the statistics for `stat_counters`.""" pass def origin_metadata_add(self, origin_url, ts, provider, tool, metadata): - """ Add an origin_metadata for the origin at ts with provenance and - metadata. - - Args: - origin_url (str): the origin url for which the metadata is added - ts (datetime): timestamp of the found metadata - provider: id of the provider of metadata (ex:'hal') - tool: id of the tool used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location - """ if not isinstance(origin_url, str): raise TypeError('origin_id must be str, not %r' % (origin_url,)) @@ -1740,25 +954,6 @@ return None def origin_metadata_get_by(self, origin_url, provider_type=None): - """Retrieve list of all origin_metadata entries for the origin_url - - Args: - origin_url (str): the origin's url - provider_type (str): (optional) type of provider - - Returns: - list of dicts: the origin_metadata dictionary with the keys: - - - origin_url (int): origin's URL - - discovery_date (datetime): timestamp of discovery - - tool_id (int): metadata's extracting tool - - metadata (jsonb) - - provider_id (int): metadata's provider - - provider_name (str) - - provider_type (str) - - provider_url (str) - - """ if not isinstance(origin_url, str): raise TypeError('origin_url must be str, not %r' % (origin_url,)) metadata = [] @@ -1772,23 +967,6 @@ return metadata def tool_add(self, tools): - """Add new tools to the storage. - - Args: - tools (iterable of :class:`dict`): Tool information to add to - storage. Each tool is a :class:`dict` with the following keys: - - - name (:class:`str`): name of the tool - - version (:class:`str`): version of the tool - - configuration (:class:`dict`): configuration of the tool, - must be json-encodable - - Returns: - :class:`dict`: All the tools inserted in storage - (including the internal ``id``). The order of the list is not - guaranteed to match the order of the initial list. - - """ inserted = [] for tool in tools: key = self._tool_key(tool) @@ -1802,32 +980,10 @@ return inserted def tool_get(self, tool): - """Retrieve tool information. - - Args: - tool (dict): Tool information we want to retrieve from storage. - The dicts have the same keys as those used in :func:`tool_add`. - - Returns: - dict: The full tool information if it exists (``id`` included), - None otherwise. - - """ return self._tools.get(self._tool_key(tool)) def metadata_provider_add(self, provider_name, provider_type, provider_url, metadata): - """Add a metadata provider. 
- - Args: - provider_name (str): Its name - provider_type (str): Its type - provider_url (str): Its URL - metadata: JSON-encodable object - - Returns: - an identifier of the provider - """ provider = { 'provider_name': provider_name, 'provider_type': provider_type, @@ -1840,28 +996,9 @@ return key def metadata_provider_get(self, provider_id): - """Get a metadata provider - - Args: - provider_id: Its identifier, as given by `metadata_provider_add`. - - Returns: - dict: same as `metadata_provider_add`; - or None if it does not exist. - """ return self._metadata_providers.get(provider_id) def metadata_provider_get_by(self, provider): - """Get a metadata provider - - Args: - provider_name: Its name - provider_url: Its URL - - Returns: - dict: same as `metadata_provider_add`; - or None if it does not exist. - """ key = self._metadata_provider_key(provider) return self._metadata_providers.get(key) @@ -1872,14 +1009,6 @@ raise TypeError('origin must be a string.') def _person_add(self, person): - """Add a person in storage. - - Note: Private method, do not use outside of this class. - - Args: - person: dictionary with keys fullname, name and email. - - """ key = ('person', person.fullname) if key not in self._objects: person_id = len(self._persons) + 1 @@ -1912,3 +1041,12 @@ @staticmethod def _metadata_provider_key(provider): return '%r %r' % (provider['provider_name'], provider['provider_url']) + + def diff_directories(self, from_dir, to_dir, track_renaming=False): + raise NotImplementedError('InMemoryStorage.diff_directories') + + def diff_revisions(self, from_rev, to_rev, track_renaming=False): + raise NotImplementedError('InMemoryStorage.diff_revisions') + + def diff_revision(self, revision, track_renaming=False): + raise NotImplementedError('InMemoryStorage.diff_revision') diff --git a/swh/storage/interface.py b/swh/storage/interface.py new file mode 100644 --- /dev/null +++ b/swh/storage/interface.py @@ -0,0 +1,1170 @@ +# Copyright (C) 2015-2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Any, Dict, List, Optional + +from swh.core.api import remote_api_endpoint + + +class StorageInterface: + @remote_api_endpoint('check_config') + def check_config(self, *, check_write): + """Check that the storage is configured and ready to go.""" + ... + + @remote_api_endpoint('content/add') + def content_add(self, content): + """Add content blobs to the storage + + Note: in case of DB errors, objects might have already been added to + the object storage and will not be removed. Since addition to the + object storage is idempotent, that should not be a problem. + + Args: + contents (iterable): iterable of dictionaries representing + individual pieces of content to add. Each dictionary has the + following keys: + + - data (bytes): the actual content + - length (int): content length (default: -1) + - one key for each checksum algorithm in + :data:`swh.model.hashutil.ALGORITHMS`, mapped to the + corresponding checksum + - status (str): one of visible, hidden, absent + - reason (str): if status = absent, the reason why + - origin (int): if status = absent, the origin we saw the + content in + + Raises: + + In case of errors, nothing is stored in the db (in the + objstorage, it could though). 
The following exceptions can
+            occur:
+
+            - HashCollision in case of collision
+            - Any other exceptions raised by the db
+
+        Returns:
+            Summary dict with the following key and associated values:
+
+                content:add: New contents added
+                content:add:bytes: Sum of the contents' length data
+                skipped_content:add: New skipped contents (no data) added
+        """
+        ...
+
+    @remote_api_endpoint('content/update')
+    def content_update(self, content, keys=[]):
+        """Update content blobs to the storage. Does nothing for unknown
+        contents or skipped ones.
+
+        Args:
+            content (iterable): iterable of dictionaries representing
+                individual pieces of content to update. Each dictionary has the
+                following keys:
+
+                - data (bytes): the actual content
+                - length (int): content length (default: -1)
+                - one key for each checksum algorithm in
+                  :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+                  corresponding checksum
+                - status (str): one of visible, hidden, absent
+
+            keys (list): List of keys (str) whose values need an update, e.g.,
+                new hash column
+
+        """
+        ...
+
+    @remote_api_endpoint('content/add_metadata')
+    def content_add_metadata(self, content):
+        """Add content metadata to the storage (like `content_add`, but
+        without inserting to the objstorage).
+
+        Args:
+            content (iterable): iterable of dictionaries representing
+                individual pieces of content to add. Each dictionary has the
+                following keys:
+
+                - length (int): content length (default: -1)
+                - one key for each checksum algorithm in
+                  :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+                  corresponding checksum
+                - status (str): one of visible, hidden, absent
+                - reason (str): if status = absent, the reason why
+                - origin (int): if status = absent, the origin we saw the
+                  content in
+                - ctime (datetime): time of insertion in the archive
+
+        Returns:
+            Summary dict with the following key and associated values:
+
+                content:add: New contents added
+                skipped_content:add: New skipped contents (no data) added
+        """
+        ...
+
+    @remote_api_endpoint('content/data')
+    def content_get(self, content):
+        """Retrieve in bulk contents and their data.
+
+        This function may yield more blobs than provided sha1 identifiers,
+        in case they collide. It may also yield `None` values when an object
+        is not found.
+
+        Args:
+            content: iterables of sha1
+
+        Yields:
+            Dict[str, bytes]: Generates streams of contents as dict with their
+                raw data:
+
+                - sha1 (bytes): content id
+                - data (bytes): content's raw data
+
+        Raises:
+            ValueError in case too many contents are requested,
+            cf. BULK_BLOCK_CONTENT_LEN_MAX
+
+        """
+        ...
+
+    @remote_api_endpoint('content/range')
+    def content_get_range(self, start, end, limit=1000):
+        """Retrieve contents within range [start, end] bound by limit.
+
+        Note that this function may return more than one blob per hash. The
+        limit is enforced with multiplicity (i.e. two blobs with the same hash
+        will count twice toward the limit).
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                than end)
+            **end** (bytes): Ending identifier range (expected larger
+                than start)
+            **limit** (int): Limit result (default to 1000)
+
+        Returns:
+            a dict with keys:
+            - contents [dict]: iterable of contents in between the range.
+            - next (bytes): There remains content in the range
+              starting from this next sha1
+
+        """
+        ...
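
The `next` key above makes `content_get_range` the natural building block for a full range scan. A minimal client-side sketch, assuming a `storage` object implementing this interface and that `next` is `None` once the range is exhausted; the `iter_content_range` helper is hypothetical, not part of the API:

```python
def iter_content_range(storage, start, end, batch_size=1000):
    """Yield every content dict in [start, end], following 'next' tokens."""
    while start is not None:
        page = storage.content_get_range(start, end, limit=batch_size)
        yield from page['contents']
        # 'next' is the sha1 to resume from, or None when the range is done
        start = page['next']
```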
+
+    @remote_api_endpoint('content/partition')
+    def content_get_partition(
+            self, partition_id: int, nb_partitions: int, limit: int = 1000,
+            page_token: str = None):
+        """Splits contents into nb_partitions, and returns one of these based on
+        partition_id (which must be in [0, nb_partitions-1])
+
+        There is no guarantee on how the partitioning is done, or the
+        result order.
+
+        Args:
+            partition_id (int): index of the partition to fetch
+            nb_partitions (int): total number of partitions to split into
+            limit (int): Limit result (default to 1000)
+            page_token (Optional[str]): opaque token used for pagination.
+
+        Returns:
+            a dict with keys:
+              - contents (List[dict]): iterable of contents in the partition.
+              - **next_page_token** (Optional[str]): opaque token to be used as
+                `page_token` for retrieving the next page. if absent, there are
+                no more pages to gather.
+        """
+        ...
+
+    @remote_api_endpoint('content/metadata')
+    def content_get_metadata(
+            self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
+        """Retrieve content metadata in bulk
+
+        Args:
+            content: iterable of content identifiers (sha1)
+
+        Returns:
+            a dict with keys the content's sha1 and the associated value
+            either the existing content's metadata or None if the content does
+            not exist.
+
+        """
+        ...
+
+    @remote_api_endpoint('content/missing')
+    def content_missing(self, content, key_hash='sha1'):
+        """List content missing from storage
+
+        Args:
+            content ([dict]): iterable of dictionaries whose keys are
+                either 'length' or an item of
+                :data:`swh.model.hashutil.ALGORITHMS`;
+                mapped to the corresponding checksum
+                (or length).
+
+            key_hash (str): name of the column to use as hash id
+                result (default: 'sha1')
+
+        Returns:
+            iterable ([bytes]): missing content ids (as per the
+            key_hash column)
+
+        Raises:
+            TODO: an exception when we get a hash collision.
+
+        """
+        ...
+
+    @remote_api_endpoint('content/missing/sha1')
+    def content_missing_per_sha1(self, contents):
+        """List content missing from storage based only on sha1.
+
+        Args:
+            contents: Iterable of sha1 to check for absence.
+
+        Returns:
+            iterable: missing ids
+
+        Raises:
+            TODO: an exception when we get a hash collision.
+
+        """
+        ...
+
+    @remote_api_endpoint('content/missing/sha1_git')
+    def content_missing_per_sha1_git(self, contents):
+        """List content missing from storage based only on sha1_git.
+
+        Args:
+            contents (Iterable): An iterable of content id (sha1_git)
+
+        Yields:
+            missing contents sha1_git
+        """
+        ...
+
+    @remote_api_endpoint('content/skipped/missing')
+    def skipped_content_missing(self, contents):
+        """List skipped_content missing from storage
+
+        Args:
+            contents: iterable of dictionaries containing the data for each
+                checksum algorithm.
+
+        Returns:
+            iterable: missing signatures
+
+        """
+        ...
+
+    @remote_api_endpoint('content/present')
+    def content_find(self, content):
+        """Find a content hash in db.
+
+        Args:
+            content: a dictionary representing one content hash, mapping
+                checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to
+                checksum values
+
+        Returns:
+            a triplet (sha1, sha1_git, sha256) if the content exists
+            or None otherwise.
+
+        Raises:
+            ValueError: in case the key of the dictionary is not sha1, sha1_git
+                nor sha256.
+
+        """
+        ...
+
+    @remote_api_endpoint('content/get_random')
+    def content_get_random(self):
+        """Finds a random content id.
+
+        Returns:
+            a sha1_git
+        """
+        ...
+
+    @remote_api_endpoint('directory/add')
+    def directory_add(self, directories):
+        """Add directories to the storage
+
+        Args:
+            directories (iterable): iterable of dictionaries representing the
+                individual directories to add. Each dict has the following
+                keys:
+
+                - id (sha1_git): the id of the directory to add
+                - entries (list): list of dicts for each entry in the
+                  directory. Each dict has the following keys:
+
+                  - name (bytes)
+                  - type (one of 'file', 'dir', 'rev'): type of the
+                    directory entry (file, directory, revision)
+                  - target (sha1_git): id of the object pointed at by the
+                    directory entry
+                  - perms (int): entry permissions
+
+        Returns:
+            Summary dict of keys with associated count as values:
+
+                directory:add: Number of directories actually added
+
+        """
+        ...
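
For reference, a hypothetical `directory_add` call shaped after the docstring above; the two 20-byte ids are fabricated for illustration and `storage` is assumed to implement this interface:

```python
dir_id = bytes.fromhex('11' * 20)   # sha1_git of the directory (made up)
file_id = bytes.fromhex('22' * 20)  # sha1_git of the file it points to (made up)

summary = storage.directory_add([{
    'id': dir_id,
    'entries': [{
        'name': b'README',
        'type': 'file',        # one of 'file', 'dir', 'rev'
        'target': file_id,
        'perms': 0o100644,     # regular-file permissions
    }],
}])
# summary['directory:add'] counts directories actually added
```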
+
+    @remote_api_endpoint('directory/missing')
+    def directory_missing(self, directories):
+        """List directories missing from storage
+
+        Args:
+            directories (iterable): an iterable of directory ids
+
+        Yields:
+            missing directory ids
+
+        """
+        ...
+
+    @remote_api_endpoint('directory/ls')
+    def directory_ls(self, directory, recursive=False):
+        """Get entries for one directory.
+
+        Args:
+            - directory: the directory to list entries from.
+            - recursive: if set, recursively list entries from this directory.
+
+        Returns:
+            List of entries for such directory.
+
+        If `recursive=True`, names in the path of a dir/file not at the
+        root are concatenated with a slash (`/`).
+
+        """
+        ...
+
+    @remote_api_endpoint('directory/path')
+    def directory_entry_get_by_path(self, directory, paths):
+        """Get the directory entry (either file or dir) from directory with path.
+
+        Args:
+            - directory: sha1 of the top level directory
+            - paths: path to lookup from the top level directory. From left
+              (top) to right (bottom).
+
+        Returns:
+            The corresponding directory entry if found, None otherwise.
+
+        """
+        ...
+
+    @remote_api_endpoint('directory/get_random')
+    def directory_get_random(self):
+        """Finds a random directory id.
+
+        Returns:
+            a sha1_git
+        """
+        ...
+
+    @remote_api_endpoint('revision/add')
+    def revision_add(self, revisions):
+        """Add revisions to the storage
+
+        Args:
+            revisions (Iterable[dict]): iterable of dictionaries representing
+                the individual revisions to add. Each dict has the following
+                keys:
+
+                - **id** (:class:`sha1_git`): id of the revision to add
+                - **date** (:class:`dict`): date the revision was written
+                - **committer_date** (:class:`dict`): date the revision got
+                  added to the origin
+                - **type** (one of 'git', 'tar'): type of the
+                  revision added
+                - **directory** (:class:`sha1_git`): the directory the
+                  revision points at
+                - **message** (:class:`bytes`): the message associated with
+                  the revision
+                - **author** (:class:`Dict[str, bytes]`): dictionary with
+                  keys: name, fullname, email
+                - **committer** (:class:`Dict[str, bytes]`): dictionary with
+                  keys: name, fullname, email
+                - **metadata** (:class:`jsonb`): extra information as
+                  dictionary
+                - **synthetic** (:class:`bool`): revision's nature (tarball,
+                  directory creates synthetic revision)
+                - **parents** (:class:`list[sha1_git]`): the parents of
+                  this revision
+
+        date dictionaries have the form defined in :mod:`swh.model`.
+
+        Returns:
+            Summary dict of keys with associated count as values
+
+                revision:add: New objects actually stored in db
+
+        """
+        ...
+
+    @remote_api_endpoint('revision/missing')
+    def revision_missing(self, revisions):
+        """List revisions missing from storage
+
+        Args:
+            revisions (iterable): revision ids
+
+        Yields:
+            missing revision ids
+
+        """
+        ...
+
+    @remote_api_endpoint('revision')
+    def revision_get(self, revisions):
+        """Get all revisions from storage
+
+        Args:
+            revisions: an iterable of revision ids
+
+        Returns:
+            iterable: an iterable of revisions as dictionaries (or None if the
+                revision doesn't exist)
+
+        """
+        ...
+
+    @remote_api_endpoint('revision/log')
+    def revision_log(self, revisions, limit=None):
+        """Fetch revision entry from the given root revisions.
+
+        Args:
+            revisions: array of root revisions to lookup
+            limit: limitation on the output result. Default to None.
+
+        Yields:
+            List of revision log from such revisions root.
+
+        """
+        ...
+
+    @remote_api_endpoint('revision/shortlog')
+    def revision_shortlog(self, revisions, limit=None):
+        """Fetch the shortlog for the given revisions
+
+        Args:
+            revisions: list of root revisions to lookup
+            limit: depth limitation for the output
+
+        Yields:
+            a list of (id, parents) tuples
+
+        """
+        ...
+
+    @remote_api_endpoint('revision/get_random')
+    def revision_get_random(self):
+        """Finds a random revision id.
+
+        Returns:
+            a sha1_git
+        """
+        ...
+
+    @remote_api_endpoint('release/add')
+    def release_add(self, releases):
+        """Add releases to the storage
+
+        Args:
+            releases (Iterable[dict]): iterable of dictionaries representing
+                the individual releases to add.
Each dict has the following + keys: + + - **id** (:class:`sha1_git`): id of the release to add + - **revision** (:class:`sha1_git`): id of the revision the + release points to + - **date** (:class:`dict`): the date the release was made + - **name** (:class:`bytes`): the name of the release + - **comment** (:class:`bytes`): the comment associated with + the release + - **author** (:class:`Dict[str, bytes]`): dictionary with + keys: name, fullname, email + + the date dictionary has the form defined in :mod:`swh.model`. + + Returns: + Summary dict of keys with associated count as values + + release:add: New objects contents actually stored in db + + """ + ... + + @remote_api_endpoint('release/missing') + def release_missing(self, releases): + """List releases missing from storage + + Args: + releases: an iterable of release ids + + Returns: + a list of missing release ids + + """ + ... + + @remote_api_endpoint('release') + def release_get(self, releases): + """Given a list of sha1, return the releases's information + + Args: + releases: list of sha1s + + Yields: + dicts with the same keys as those given to `release_add` + (or ``None`` if a release does not exist) + + """ + ... + + @remote_api_endpoint('release/get_random') + def release_get_random(self): + """Finds a random release id. + + Returns: + a sha1_git + """ + ... + + @remote_api_endpoint('snapshot/add') + def snapshot_add(self, snapshots): + """Add snapshots to the storage. + + Args: + snapshot ([dict]): the snapshots to add, containing the + following keys: + + - **id** (:class:`bytes`): id of the snapshot + - **branches** (:class:`dict`): branches the snapshot contains, + mapping the branch name (:class:`bytes`) to the branch target, + itself a :class:`dict` (or ``None`` if the branch points to an + unknown object) + + - **target_type** (:class:`str`): one of ``content``, + ``directory``, ``revision``, ``release``, + ``snapshot``, ``alias`` + - **target** (:class:`bytes`): identifier of the target + (currently a ``sha1_git`` for all object kinds, or the name + of the target branch for aliases) + + Raises: + ValueError: if the origin or visit id does not exist. + + Returns: + + Summary dict of keys with associated count as values + + snapshot:add: Count of object actually stored in db + + """ + ... + + @remote_api_endpoint('snapshot/missing') + def snapshot_missing(self, snapshots): + """List snapshots missing from storage + + Args: + snapshots (iterable): an iterable of snapshot ids + + Yields: + missing snapshot ids + + """ + ... + + @remote_api_endpoint('snapshot') + def snapshot_get(self, snapshot_id): + """Get the content, possibly partial, of a snapshot with the given id + + The branches of the snapshot are iterated in the lexicographical + order of their names. + + .. warning:: At most 1000 branches contained in the snapshot will be + returned for performance reasons. In order to browse the whole + set of branches, the method :meth:`snapshot_get_branches` + should be used instead. + + Args: + snapshot_id (bytes): identifier of the snapshot + Returns: + dict: a dict with three keys: + * **id**: identifier of the snapshot + * **branches**: a dict of branches contained in the snapshot + whose keys are the branches' names. + * **next_branch**: the name of the first branch not returned + or :const:`None` if the snapshot has less than 1000 + branches. + """ + ... 
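
As the warning above notes, `snapshot_get` caps results at 1000 branches. A sketch of the recommended full traversal via `snapshot_get_branches` (documented further down), assuming a `storage` object implementing this interface; since `next_branch` is the first branch *not* returned, resuming from it produces no duplicates:

```python
def iter_snapshot_branches(storage, snapshot_id, batch=1000):
    """Yield (name, branch) pairs for every branch of a snapshot."""
    name = b''
    while name is not None:
        snap = storage.snapshot_get_branches(
            snapshot_id, branches_from=name, branches_count=batch)
        if snap is None:  # unknown snapshot
            return
        yield from snap['branches'].items()
        name = snap['next_branch']  # None once every branch was returned
```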
+ + @remote_api_endpoint('snapshot/by_origin_visit') + def snapshot_get_by_origin_visit(self, origin, visit): + """Get the content, possibly partial, of a snapshot for the given origin visit + + The branches of the snapshot are iterated in the lexicographical + order of their names. + + .. warning:: At most 1000 branches contained in the snapshot will be + returned for performance reasons. In order to browse the whole + set of branches, the method :meth:`snapshot_get_branches` + should be used instead. + + Args: + origin (int): the origin identifier + visit (int): the visit identifier + Returns: + dict: None if the snapshot does not exist; + a dict with three keys otherwise: + * **id**: identifier of the snapshot + * **branches**: a dict of branches contained in the snapshot + whose keys are the branches' names. + * **next_branch**: the name of the first branch not returned + or :const:`None` if the snapshot has less than 1000 + branches. + + """ + ... + + @remote_api_endpoint('snapshot/latest') + def snapshot_get_latest(self, origin, allowed_statuses=None): + """Get the content, possibly partial, of the latest snapshot for the + given origin, optionally only from visits that have one of the given + allowed_statuses + + The branches of the snapshot are iterated in the lexicographical + order of their names. + + .. warning:: At most 1000 branches contained in the snapshot will be + returned for performance reasons. In order to browse the whole + set of branches, the method :meth:`snapshot_get_branches` + should be used instead. + + Args: + origin (str): the origin's URL + allowed_statuses (list of str): list of visit statuses considered + to find the latest snapshot for the visit. For instance, + ``allowed_statuses=['full']`` will only consider visits that + have successfully run to completion. + Returns: + dict: a dict with three keys: + * **id**: identifier of the snapshot + * **branches**: a dict of branches contained in the snapshot + whose keys are the branches' names. + * **next_branch**: the name of the first branch not returned + or :const:`None` if the snapshot has less than 1000 + branches. + """ + ... + + @remote_api_endpoint('snapshot/count_branches') + def snapshot_count_branches(self, snapshot_id): + """Count the number of branches in the snapshot with the given id + + Args: + snapshot_id (bytes): identifier of the snapshot + + Returns: + dict: A dict whose keys are the target types of branches and + values their corresponding amount + """ + ... + + @remote_api_endpoint('snapshot/get_branches') + def snapshot_get_branches(self, snapshot_id, branches_from=b'', + branches_count=1000, target_types=None): + """Get the content, possibly partial, of a snapshot with the given id + + The branches of the snapshot are iterated in the lexicographical + order of their names. 
+
+        Args:
+            snapshot_id (bytes): identifier of the snapshot
+            branches_from (bytes): optional parameter used to skip branches
+                whose name is lesser than it before returning them
+            branches_count (int): optional parameter used to restrain
+                the amount of returned branches
+            target_types (list): optional parameter used to filter the
+                target types of branch to return (possible values that can be
+                contained in that list are `'content', 'directory',
+                'revision', 'release', 'snapshot', 'alias'`)
+        Returns:
+            dict: None if the snapshot does not exist;
+                a dict with three keys otherwise:
+                * **id**: identifier of the snapshot
+                * **branches**: a dict of branches contained in the snapshot
+                  whose keys are the branches' names.
+                * **next_branch**: the name of the first branch not returned
+                  or :const:`None` if the snapshot has less than
+                  `branches_count` branches after `branches_from` included.
+        """
+        ...
+
+    @remote_api_endpoint('snapshot/get_random')
+    def snapshot_get_random(self):
+        """Finds a random snapshot id.
+
+        Returns:
+            a sha1_git
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/add')
+    def origin_visit_add(self, origin, date, type):
+        """Add an origin_visit for the origin at date with status 'ongoing'.
+
+        Args:
+            origin (str): visited origin's identifier or URL
+            date (Union[str,datetime]): timestamp of such visit
+            type (str): the type of loader used for the visit (hg, git, ...)
+
+        Returns:
+            dict: dictionary with keys origin and visit where:
+
+            - origin: origin identifier
+            - visit: the visit identifier for the new visit occurrence
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/update')
+    def origin_visit_update(self, origin, visit_id, status=None,
+                            metadata=None, snapshot=None):
+        """Update an origin_visit's status.
+
+        Args:
+            origin (str): visited origin's URL
+            visit_id (int): visit's identifier
+            status: Visit's new status
+            metadata: Data associated to the visit
+            snapshot (sha1_git): identifier of the snapshot to add to
+                the visit
+
+        Returns:
+            None
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/upsert')
+    def origin_visit_upsert(self, visits):
+        """Add an origin_visit with a specific id and with all its data.
+        If there is already an origin_visit with the same
+        `(origin_url, visit_id)`, updates it instead of inserting a new one.
+
+        Args:
+            visits: iterable of dicts with keys:
+
+            - **origin**: origin url
+            - **visit**: origin visit id
+            - **type**: type of loader used for the visit
+            - **date**: timestamp of such visit
+            - **status**: Visit's new status
+            - **metadata**: Data associated to the visit
+            - **snapshot**: identifier of the snapshot to add to
+              the visit
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/get')
+    def origin_visit_get(self, origin, last_visit=None, limit=None):
+        """Retrieve all the origin's visit's information.
+
+        Args:
+            origin (str): The visited origin
+            last_visit: Starting point from which listing the next visits
+                Default to None
+            limit (int): Number of results to return from the last visit.
+                Default to None
+
+        Yields:
+            List of visits.
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/find_by_date')
+    def origin_visit_find_by_date(self, origin, visit_date):
+        """Retrieves the origin visit whose date is closest to the provided
+        timestamp.
+        In case of a tie, the visit with largest id is selected.
+
+        Args:
+            origin (str): The occurrence's origin (URL).
+            visit_date (datetime): target timestamp
+
+        Returns:
+            A visit.
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/getby')
+    def origin_visit_get_by(self, origin, visit):
+        """Retrieve origin visit's information.
+
+        Args:
+            origin: The occurrence's origin (identifier).
+            visit: The visit identifier.
+
+        Returns:
+            The information on that particular (origin, visit) or None if
+            it does not exist
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/get_latest')
+    def origin_visit_get_latest(
+            self, origin, allowed_statuses=None, require_snapshot=False):
+        """Get the latest origin visit for the given origin, optionally
+        looking only for those with one of the given allowed_statuses
+        or for those with a known snapshot.
+
+        Args:
+            origin (str): the origin's URL
+            allowed_statuses (list of str): list of visit statuses considered
+                to find the latest visit. For instance,
+                ``allowed_statuses=['full']`` will only consider visits that
+                have successfully run to completion.
+            require_snapshot (bool): If True, only a visit with a snapshot
+                will be returned.
+
+        Returns:
+            dict: a dict with the following keys:
+
+            - **origin**: the URL of the origin
+            - **visit**: origin visit id
+            - **type**: type of loader used for the visit
+            - **date**: timestamp of such visit
+            - **status**: Visit's status
+            - **metadata**: Data associated to the visit
+            - **snapshot** (Optional[sha1_git]): identifier of the snapshot
+              associated to the visit
+        """
+        ...
+
+    @remote_api_endpoint('origin/visit/get_random')
+    def origin_visit_get_random(
+            self, type: str) -> Optional[Dict[str, Any]]:
+        """Randomly select one successful origin visit made in the last
+        3 months.
+
+        Returns:
+            dict representing an origin visit, in the same format as
+            :py:meth:`origin_visit_get`.
+
+        """
+        ...
+
+    @remote_api_endpoint('object/find_by_sha1_git')
+    def object_find_by_sha1_git(self, ids):
+        """Return the objects found with the given ids.
+
+        Args:
+            ids: a generator of sha1_gits
+
+        Returns:
+            dict: a mapping from id to the list of objects found. Each object
+            found is itself a dict with keys:
+
+            - sha1_git: the input id
+            - type: the type of object found
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/get')
+    def origin_get(self, origins):
+        """Return origins, identified by their urls.
+
+        Args:
+            origins: a list of dictionaries representing the individual
+                origins to find.
+                These dicts have the key url:
+
+                - url (bytes): the url the origin points to
+
+        Returns:
+            dict: the origin dictionary with the keys:
+
+            - id: origin's id
+            - url: origin's url
+
+        Raises:
+            ValueError: if the url or the id don't exist.
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/get_sha1')
+    def origin_get_by_sha1(self, sha1s):
+        """Return origins, identified by the sha1 of their URLs.
+
+        Args:
+            sha1s (list[bytes]): a list of sha1s
+
+        Yields:
+            dicts containing origin information as returned
+            by :meth:`swh.storage.storage.Storage.origin_get`, or None if an
+            origin matching the sha1 is not found.
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/get_range')
+    def origin_get_range(self, origin_from=1, origin_count=100):
+        """Retrieve ``origin_count`` origins whose ids are greater than
+        or equal to ``origin_from``.
+
+        Origins are sorted by id before retrieving them.
+
+        Args:
+            origin_from (int): the minimum id of origins to retrieve
+            origin_count (int): the maximum number of origins to retrieve
+
+        Yields:
+            dicts containing origin information as returned
+            by :meth:`swh.storage.storage.Storage.origin_get`.
+        """
+        ...
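
As an aside on how the range-based read endpoints above are meant to be combined: the sketch below is illustrative only and not part of this diff. It pages through every origin with origin_get_range, assuming `storage` is any object implementing StorageInterface and that the yielded dicts carry an 'id' key, per the origin_get documentation referenced above.

    def iter_all_origins(storage, batch_size=100):
        # Walk origins in increasing id order, one batch at a time.
        origin_from = 1
        while True:
            batch = list(storage.origin_get_range(
                origin_from=origin_from, origin_count=batch_size))
            yield from batch
            if len(batch) < batch_size:
                return  # a short batch means no origins remain
            origin_from = batch[-1]['id'] + 1
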
+
+    @remote_api_endpoint('origin/list')
+    def origin_list(
+            self, page_token: Optional[str] = None, limit: int = 100) -> dict:
+        """Returns the list of origins
+
+        Args:
+            page_token: opaque token used for pagination.
+            limit: the maximum number of results to return
+
+        Returns:
+            dict: dict with the following keys:
+              - **next_page_token** (str, optional): opaque token to be used as
+                `page_token` for retrieving the next page. If absent, there are
+                no more pages to gather.
+              - **origins** (List[dict]): list of origins, as returned by
+                `origin_get`.
+        """
+        ...
+
+    @remote_api_endpoint('origin/search')
+    def origin_search(self, url_pattern, offset=0, limit=50,
+                      regexp=False, with_visit=False):
+        """Search for origins whose urls contain a provided string pattern
+        or match a provided regular expression.
+        The search is performed in a case-insensitive way.
+
+        Args:
+            url_pattern (str): the string pattern to search for in origin urls
+            offset (int): number of found origins to skip before returning
+                results
+            limit (int): the maximum number of found origins to return
+            regexp (bool): if True, consider the provided pattern as a regular
+                expression and return origins whose urls match it
+            with_visit (bool): if True, filter out origins with no visit
+
+        Yields:
+            dicts containing origin information as returned
+            by :meth:`swh.storage.storage.Storage.origin_get`.
+        """
+        ...
+
+    @remote_api_endpoint('origin/count')
+    def origin_count(self, url_pattern, regexp=False,
+                     with_visit=False):
+        """Count origins whose urls contain a provided string pattern
+        or match a provided regular expression.
+        The pattern search in origin urls is performed in a case-insensitive
+        way.
+
+        Args:
+            url_pattern (str): the string pattern to search for in origin urls
+            regexp (bool): if True, consider the provided pattern as a regular
+                expression and return origins whose urls match it
+            with_visit (bool): if True, filter out origins with no visit
+
+        Returns:
+            int: The number of origins matching the search criterion.
+        """
+        ...
+
+    @remote_api_endpoint('origin/add_multi')
+    def origin_add(self, origins):
+        """Add origins to the storage
+
+        Args:
+            origins: list of dictionaries representing the individual origins,
+                with the following keys:
+
+                - type: the origin type ('git', 'svn', 'deb', ...)
+                - url (bytes): the url the origin points to
+
+        Returns:
+            list: the given origins, as dicts updated with their id
+
+        """
+        ...
+
+    @remote_api_endpoint('origin/add')
+    def origin_add_one(self, origin):
+        """Add origin to the storage
+
+        Args:
+            origin: dictionary representing the individual origin to add. This
+                dict has the following keys:
+
+                - type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
+                - url (bytes): the url the origin points to
+
+        Returns:
+            the id of the added origin, or of the identical one that already
+            exists.
+
+        """
+        ...
+
+    def stat_counters(self):
+        """Compute statistics about the number of tuples in various tables
+
+        Returns:
+            dict: a dictionary mapping textual labels (e.g., content) to
+            integer values (e.g., the number of tuples in table content)
+
+        """
+        ...
+
+    def refresh_stat_counters(self):
+        """Recomputes the statistics for `stat_counters`."""
+        ...
+
+    @remote_api_endpoint('origin/metadata/add')
+    def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
+        """Add an origin_metadata for the origin at ts with provenance and
+        metadata.
+
+        Args:
+            origin_url (str): the origin url for which the metadata is added
+            ts (datetime): timestamp of the found metadata
+            provider (int): the provider of metadata (e.g. 'hal')
+            tool (int): tool used to extract metadata
+            metadata (jsonb): the metadata retrieved at the time and location
+        """
+        ...
+
+    @remote_api_endpoint('origin/metadata/get')
+    def origin_metadata_get_by(self, origin_url, provider_type=None):
+        """Retrieve the list of all origin_metadata entries for the given
+        origin URL.
+
+        Args:
+            origin_url (str): the origin's URL
+            provider_type (str): (optional) type of provider
+
+        Returns:
+            list of dicts: the origin_metadata dictionary with the keys:
+
+            - origin_id (int): origin's id
+            - discovery_date (datetime): timestamp of discovery
+            - tool_id (int): metadata's extracting tool
+            - metadata (jsonb)
+            - provider_id (int): metadata's provider
+            - provider_name (str)
+            - provider_type (str)
+            - provider_url (str)
+
+        """
+        ...
+
+    @remote_api_endpoint('tool/add')
+    def tool_add(self, tools):
+        """Add new tools to the storage.
+
+        Args:
+            tools (iterable of :class:`dict`): Tool information to add to
+                storage. Each tool is a :class:`dict` with the following keys:
+
+                - name (:class:`str`): name of the tool
+                - version (:class:`str`): version of the tool
+                - configuration (:class:`dict`): configuration of the tool,
+                  must be json-encodable
+
+        Returns:
+            list of :class:`dict`: All the tools inserted in storage
+            (including the internal ``id``). The order of the list is not
+            guaranteed to match the order of the initial list.
+
+        """
+        ...
+
+    @remote_api_endpoint('tool/data')
+    def tool_get(self, tool):
+        """Retrieve tool information.
+
+        Args:
+            tool (dict): Tool information we want to retrieve from storage.
+                The dicts have the same keys as those used in :func:`tool_add`.
+
+        Returns:
+            dict: The full tool information if it exists (``id`` included),
+            None otherwise.
+
+        """
+        ...
+
+    @remote_api_endpoint('provider/add')
+    def metadata_provider_add(self, provider_name, provider_type, provider_url,
+                              metadata):
+        """Add a metadata provider.
+
+        Args:
+            provider_name (str): Its name
+            provider_type (str): Its type (e.g. `'deposit-client'`)
+            provider_url (str): Its URL
+            metadata: JSON-encodable object
+
+        Returns:
+            int: an identifier of the provider
+        """
+        ...
+
+    @remote_api_endpoint('provider/get')
+    def metadata_provider_get(self, provider_id):
+        """Get a metadata provider
+
+        Args:
+            provider_id: Its identifier, as given by `metadata_provider_add`.
+
+        Returns:
+            dict: the provider information, with the same keys as those given
+            to `metadata_provider_add`; or None if it does not exist.
+        """
+        ...
+
+    @remote_api_endpoint('provider/getby')
+    def metadata_provider_get_by(self, provider):
+        """Get a metadata provider
+
+        Args:
+            provider (dict): A dictionary with keys:
+                * provider_name: Its name
+                * provider_url: Its URL
+
+        Returns:
+            dict: the provider information, with the same keys as those given
+            to `metadata_provider_add`; or None if it does not exist.
+        """
+        ...
+
+    @remote_api_endpoint('algos/diff_directories')
+    def diff_directories(self, from_dir, to_dir, track_renaming=False):
+        """Compute the list of file changes introduced between two arbitrary
+        directories (insertion / deletion / modification / renaming of files).
+
+        Args:
+            from_dir (bytes): identifier of the directory to compare from
+            to_dir (bytes): identifier of the directory to compare to
+            track_renaming (bool): whether or not to track file renaming
+
+        Returns:
+            A list of dicts describing the introduced file changes
+            (see :func:`swh.storage.algos.diff.diff_directories`
+            for more details).
+        """
+        ...
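
To make the metadata-provider endpoints above concrete, here is a minimal usage sketch. It is illustrative only and not part of this diff: the provider name and URL are made up, and `storage` is assumed to be any object implementing StorageInterface.

    # Register a provider, then look it up by id and by (name, url).
    provider_id = storage.metadata_provider_add(
        provider_name='hal',
        provider_type='deposit-client',
        provider_url='https://hal.example.org/',
        metadata={'notes': 'example provider'})
    by_id = storage.metadata_provider_get(provider_id)
    by_key = storage.metadata_provider_get_by({
        'provider_name': 'hal',
        'provider_url': 'https://hal.example.org/'})
    # Both lookups describe the same provider; each returns None when
    # the provider is unknown.
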
+
+    @remote_api_endpoint('algos/diff_revisions')
+    def diff_revisions(self, from_rev, to_rev, track_renaming=False):
+        """Compute the list of file changes introduced between two arbitrary
+        revisions (insertion / deletion / modification / renaming of files).
+
+        Args:
+            from_rev (bytes): identifier of the revision to compare from
+            to_rev (bytes): identifier of the revision to compare to
+            track_renaming (bool): whether or not to track file renaming
+
+        Returns:
+            A list of dicts describing the introduced file changes
+            (see :func:`swh.storage.algos.diff.diff_directories`
+            for more details).
+        """
+        ...
+
+    @remote_api_endpoint('algos/diff_revision')
+    def diff_revision(self, revision, track_renaming=False):
+        """Compute the list of file changes introduced by a specific revision
+        (insertion / deletion / modification / renaming of files) by comparing
+        it against its first parent.
+
+        Args:
+            revision (bytes): identifier of the revision from which to
+                compute the list of file changes
+            track_renaming (bool): whether or not to track file renaming
+
+        Returns:
+            A list of dicts describing the introduced file changes
+            (see :func:`swh.storage.algos.diff.diff_directories`
+            for more details).
+        """
+        ...
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -17,7 +17,6 @@
 import psycopg2
 import psycopg2.pool
 
-from swh.core.api import remote_api_endpoint
 from swh.model.model import SHA1_SIZE
 from swh.model.hashutil import ALGORITHMS, hash_to_bytes, hash_to_hex
 from swh.objstorage import get_objstorage
@@ -99,11 +98,9 @@
         if db:
             self.put_db(db)
 
-    @remote_api_endpoint('check_config')
     @timed
     @db_transaction()
     def check_config(self, *, check_write, db=None, cur=None):
-        """Check that the storage is configured and ready to go."""
         if not self.objstorage.check_config(check_write=check_write):
             return False
@@ -232,48 +229,10 @@
         # move metadata in place
         db.skipped_content_add_from_temp(cur)
 
-    @remote_api_endpoint('content/add')
     @timed
     @process_metrics
     @db_transaction()
     def content_add(self, content, db=None, cur=None):
-        """Add content blobs to the storage
-
-        Note: in case of DB errors, objects might have already been added to
-        the object storage and will not be removed. Since addition to the
-        object storage is idempotent, that should not be a problem.
-
-        Args:
-            contents (iterable): iterable of dictionaries representing
-                individual pieces of content to add. Each dictionary has the
-                following keys:
-
-                - data (bytes): the actual content
-                - length (int): content length (default: -1)
-                - one key for each checksum algorithm in
-                  :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
-                  corresponding checksum
-                - status (str): one of visible, hidden, absent
-                - reason (str): if status = absent, the reason why
-                - origin (int): if status = absent, the origin we saw the
-                  content in
-
-        Raises:
-
-            In case of errors, nothing is stored in the db (in the
-            objstorage, it could though). The following exceptions can
-            occur:
-
-            - HashCollision in case of collision
-            - Any other exceptions raise by the db
-
-        Returns:
-            Summary dict with the following key and associated values:
-
-                content:add: New contents added
-                content:add:bytes: Sum of the contents' length data
-                skipped_content:add: New skipped contents (no data) added
-        """
         content = [dict(c.items()) for c in content]  # semi-shallow copy
         now = datetime.datetime.now(tz=datetime.timezone.utc)
         for item in content:
@@ -329,35 +288,15 @@
         summary['content:add:bytes'] = content_bytes_added
         return summary
 
-    @remote_api_endpoint('content/update')
     @timed
     @db_transaction()
     def content_update(self, content, keys=[], db=None, cur=None):
-        """Update content blobs to the storage. Does nothing for unknown
-        contents or skipped ones.
-
-        Args:
-            content (iterable): iterable of dictionaries representing
-                individual pieces of content to update. Each dictionary has the
-                following keys:
-
-                - data (bytes): the actual content
-                - length (int): content length (default: -1)
-                - one key for each checksum algorithm in
-                  :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
-                  corresponding checksum
-                - status (str): one of visible, hidden, absent
-
-            keys (list): List of keys (str) whose values needs an update, e.g.,
-                new hash column
-
-        """
         # TODO: Add a check on input keys. How to properly implement
         # this? We don't know yet the new columns.
         if self.journal_writer:
             raise NotImplementedError(
-                'content_update is not yet support with a journal_writer.')
+                'content_update is not yet supported with a journal_writer.')
 
         db.mktemp('content', cur)
         select_keys = list(set(db.content_get_metadata_keys).union(set(keys)))
@@ -365,36 +304,10 @@
 
         db.content_update_from_temp(keys_to_update=keys,
                                     cur=cur)
 
-    @remote_api_endpoint('content/add_metadata')
     @timed
     @process_metrics
     @db_transaction()
     def content_add_metadata(self, content, db=None, cur=None):
-        """Add content metadata to the storage (like `content_add`, but
-        without inserting to the objstorage).
-
-        Args:
-            content (iterable): iterable of dictionaries representing
-                individual pieces of content to add. Each dictionary has the
-                following keys:
-
-                - length (int): content length (default: -1)
-                - one key for each checksum algorithm in
-                  :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
-                  corresponding checksum
-                - status (str): one of visible, hidden, absent
-                - reason (str): if status = absent, the reason why
-                - origin (int): if status = absent, the origin we saw the
-                  content in
-                - ctime (datetime): time of insertion in the archive
-
-        Returns:
-            Summary dict with the following key and associated values:
-
-                content:add: New contents added
-                skipped_content:add: New skipped contents (no data) added
-        """
-
         content = [self._normalize_content(c) for c in content]
         for c in content:
             self._validate_content(c)
@@ -413,31 +326,8 @@
 
         return summary
 
-    @remote_api_endpoint('content/data')
     @timed
     def content_get(self, content):
-        """Retrieve in bulk contents and their data.
-
-        This generator yields exactly as many items than provided sha1
-        identifiers, but callers should not assume this will always be true.
-
-        It may also yield `None` values in case an object was not found.
-
-        Args:
-            content: iterables of sha1
-
-        Yields:
-            Dict[str, bytes]: Generates streams of contents as dict with their
-                raw data:
-
-                - sha1 (bytes): content id
-                - data (bytes): content's raw data
-
-        Raises:
-            ValueError in case of too much contents are required.
-            cf. BULK_BLOCK_CONTENT_LEN_MAX
-
-        """
         # FIXME: Make this method support slicing the `data`.
         if len(content) > BULK_BLOCK_CONTENT_LEN_MAX:
             raise ValueError(
@@ -452,30 +342,9 @@
             yield {'sha1': obj_id, 'data': data}
 
-    @remote_api_endpoint('content/range')
     @timed
     @db_transaction()
     def content_get_range(self, start, end, limit=1000, db=None, cur=None):
-        """Retrieve contents within range [start, end] bound by limit.
-
-        Note that this function may return more than one blob per hash. The
-        limit is enforced with multiplicity (ie. two blobs with the same hash
-        will count twice toward the limit).
-
-        Args:
-            **start** (bytes): Starting identifier range (expected smaller
-                than end)
-            **end** (bytes): Ending identifier range (expected larger
-                than start)
-            **limit** (int): Limit result (default to 1000)
-
-        Returns:
-            a dict with keys:
-            - contents [dict]: iterable of contents in between the range.
-            - next (bytes): There remains content in the range
-              starting from this next sha1
-
-        """
         if limit is None:
             raise ValueError('Development error: limit should not be None')
         contents = []
@@ -493,31 +362,11 @@
             'next': next_content,
         }
 
-    @remote_api_endpoint('content/partition')
     @timed
     @db_transaction()
     def content_get_partition(
             self, partition_id: int, nb_partitions: int, limit: int = 1000,
             page_token: str = None, db=None, cur=None):
-        """Splits contents into nb_partitions, and returns one of these based on
-        partition_id (which must be in [0, nb_partitions-1])
-
-        There is no guarantee on how the partitioning is done, or the
-        result order.
-
-        Args:
-            partition_id (int): index of the partition to fetch
-            nb_partitions (int): total number of partitions to split into
-            limit (int): Limit result (default to 1000)
-            page_token (Optional[str]): opaque token used for pagination.
-
-        Returns:
-            a dict with keys:
-              - contents (List[dict]): iterable of contents in the partition.
-              - **next_page_token** (Optional[str]): opaque token to be used as
-                `page_token` for retrieving the next page. if absent, there is
-                no more pages to gather.
-        """
         if limit is None:
             raise ValueError('Development error: limit should not be None')
         (start, end) = get_partition_bounds_bytes(
@@ -535,53 +384,20 @@
             result2['next_page_token'] = hash_to_hex(result['next'])
 
         return result2
 
-    @remote_api_endpoint('content/metadata')
     @timed
     @db_transaction(statement_timeout=500)
     def content_get_metadata(
             self, contents: List[bytes],
             db=None, cur=None) -> Dict[bytes, List[Dict]]:
-        """Retrieve content metadata in bulk
-
-        Args:
-            content: iterable of content identifiers (sha1)
-
-        Returns:
-            a dict with keys the content's sha1 and the associated value
-            either the existing content's metadata or None if the content does
-            not exist.
-
-        """
         result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}
         for row in db.content_get_metadata_from_sha1s(contents, cur):
             content_meta = dict(zip(db.content_get_metadata_keys, row))
             result[content_meta['sha1']].append(content_meta)
         return result
 
-    @remote_api_endpoint('content/missing')
     @timed
     @db_transaction_generator()
     def content_missing(self, content, key_hash='sha1', db=None, cur=None):
-        """List content missing from storage
-
-        Args:
-            content ([dict]): iterable of dictionaries whose keys are
-                either 'length' or an item of
-                :data:`swh.model.hashutil.ALGORITHMS`;
-                mapped to the corresponding checksum
-                (or length).
-
-            key_hash (str): name of the column to use as hash id
-                result (default: 'sha1')
-
-        Returns:
-            iterable ([bytes]): missing content ids (as per the
-            key_hash column)
-
-        Raises:
-            TODO: an exception when we get a hash collision.
-
-        """
         keys = db.content_hash_keys
         if key_hash not in keys:
@@ -595,77 +411,27 @@
         for obj in db.content_missing_from_list(content, cur):
             yield obj[key_hash_idx]
 
-    @remote_api_endpoint('content/missing/sha1')
     @timed
     @db_transaction_generator()
     def content_missing_per_sha1(self, contents, db=None, cur=None):
-        """List content missing from storage based only on sha1.
-
-        Args:
-            contents: Iterable of sha1 to check for absence.
-
-        Returns:
-            iterable: missing ids
-
-        Raises:
-            TODO: an exception when we get a hash collision.
-
-        """
         for obj in db.content_missing_per_sha1(contents, cur):
             yield obj[0]
 
-    @remote_api_endpoint('content/missing/sha1_git')
     @timed
     @db_transaction_generator()
     def content_missing_per_sha1_git(self, contents, db=None, cur=None):
-        """List content missing from storage based only on sha1_git.
-
-        Args:
-            contents (Iterable): An iterable of content id (sha1_git)
-
-        Yields:
-            missing contents sha1_git
-        """
         for obj in db.content_missing_per_sha1_git(contents, cur):
             yield obj[0]
 
-    @remote_api_endpoint('content/skipped/missing')
     @timed
     @db_transaction_generator()
     def skipped_content_missing(self, contents, db=None, cur=None):
-        """List skipped_content missing from storage
-
-        Args:
-            content: iterable of dictionaries containing the data for each
-                checksum algorithm.
-
-        Returns:
-            iterable: missing signatures
-
-        """
         for content in db.skipped_content_missing(contents, cur):
             yield dict(zip(db.content_hash_keys, content))
 
-    @remote_api_endpoint('content/present')
     @timed
     @db_transaction()
     def content_find(self, content, db=None, cur=None):
-        """Find a content hash in db.
-
-        Args:
-            content: a dictionary representing one content hash, mapping
-                checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to
-                checksum values
-
-        Returns:
-            a triplet (sha1, sha1_git, sha256) if the content exist
-            or None otherwise.
-
-        Raises:
-            ValueError: in case the key of the dictionary is not sha1, sha1_git
-                nor sha256.
-
-        """
         if not set(content).intersection(ALGORITHMS):
             raise ValueError('content keys must contain at least one of: '
                              'sha1, sha1_git, sha256, blake2s256')
@@ -678,46 +444,15 @@
         return [dict(zip(db.content_find_cols, content))
                 for content in contents]
 
-    @remote_api_endpoint('content/get_random')
     @timed
     @db_transaction()
     def content_get_random(self, db=None, cur=None):
-        """Finds a random content id.
-
-        Returns:
-            a sha1_git
-        """
         return db.content_get_random(cur)
 
-    @remote_api_endpoint('directory/add')
     @timed
     @process_metrics
     @db_transaction()
     def directory_add(self, directories, db=None, cur=None):
-        """Add directories to the storage
-
-        Args:
-            directories (iterable): iterable of dictionaries representing the
-                individual directories to add. Each dict has the following
-                keys:
-
-                - id (sha1_git): the id of the directory to add
-                - entries (list): list of dicts for each entry in the
-                  directory. Each dict has the following keys:
-                  - name (bytes)
-                  - type (one of 'file', 'dir', 'rev'): type of the
-                    directory entry (file, directory, revision)
-                  - target (sha1_git): id of the object pointed at by the
-                    directory entry
-                  - perms (int): entry permissions
-
-        Returns:
-            Summary dict of keys with associated count as values:
-
-                directory:add: Number of directories actually added
-
-        """
         directories = list(directories)
         summary = {'directory:add': 0}
@@ -778,39 +513,15 @@
 
         return summary
 
-    @remote_api_endpoint('directory/missing')
     @timed
     @db_transaction_generator()
     def directory_missing(self, directories, db=None, cur=None):
-        """List directories missing from storage
-
-        Args:
-            directories (iterable): an iterable of directory ids
-
-        Yields:
-            missing directory ids
-
-        """
         for obj in db.directory_missing_from_list(directories, cur):
             yield obj[0]
 
-    @remote_api_endpoint('directory/ls')
     @timed
     @db_transaction_generator(statement_timeout=20000)
     def directory_ls(self, directory, recursive=False, db=None, cur=None):
-        """Get entries for one directory.
-
-        Args:
-            - directory: the directory to list entries from.
-            - recursive: if flag on, this list recursively from this directory.
-
-        Returns:
-            List of entries for such directory.
-
-            If `recursive=True`, names in the path of a dir/file not at the
-            root are concatenated with a slash (`/`).
-
-        """
         if recursive:
             res_gen = db.directory_walk(directory, cur=cur)
         else:
@@ -819,77 +530,22 @@
         for line in res_gen:
             yield dict(zip(db.directory_ls_cols, line))
 
-    @remote_api_endpoint('directory/path')
     @timed
     @db_transaction(statement_timeout=2000)
     def directory_entry_get_by_path(self, directory, paths, db=None, cur=None):
-        """Get the directory entry (either file or dir) from directory with path.
-
-        Args:
-            - directory: sha1 of the top level directory
-            - paths: path to lookup from the top level directory. From left
-              (top) to right (bottom).
-
-        Returns:
-            The corresponding directory entry if found, None otherwise.
-
-        """
         res = db.directory_entry_get_by_path(directory, paths, cur)
         if res:
             return dict(zip(db.directory_ls_cols, res))
 
-    @remote_api_endpoint('directory/get_random')
     @timed
     @db_transaction()
     def directory_get_random(self, db=None, cur=None):
-        """Finds a random directory id.
-
-        Returns:
-            a sha1_git
-        """
         return db.directory_get_random(cur)
 
-    @remote_api_endpoint('revision/add')
     @timed
     @process_metrics
     @db_transaction()
     def revision_add(self, revisions, db=None, cur=None):
-        """Add revisions to the storage
-
-        Args:
-            revisions (Iterable[dict]): iterable of dictionaries representing
-                the individual revisions to add. Each dict has the following
-                keys:
-
-                - **id** (:class:`sha1_git`): id of the revision to add
-                - **date** (:class:`dict`): date the revision was written
-                - **committer_date** (:class:`dict`): date the revision got
-                  added to the origin
-                - **type** (one of 'git', 'tar'): type of the
-                  revision added
-                - **directory** (:class:`sha1_git`): the directory the
-                  revision points at
-                - **message** (:class:`bytes`): the message associated with
-                  the revision
-                - **author** (:class:`Dict[str, bytes]`): dictionary with
-                  keys: name, fullname, email
-                - **committer** (:class:`Dict[str, bytes]`): dictionary with
-                  keys: name, fullname, email
-                - **metadata** (:class:`jsonb`): extra information as
-                  dictionary
-                - **synthetic** (:class:`bool`): revision's nature (tarball,
-                  directory creates synthetic revision`)
-                - **parents** (:class:`list[sha1_git]`): the parents of
-                  this revision
-
-        date dictionaries have the form defined in :mod:`swh.model`
-
-        Returns:
-            Summary dict of keys with associated count as values
-
-                revision:add: New objects actually stored in db
-
-        """
         revisions = list(revisions)
         summary = {'revision:add': 0}
@@ -925,39 +581,18 @@
 
         return {'revision:add': len(revisions_missing)}
 
-    @remote_api_endpoint('revision/missing')
     @timed
     @db_transaction_generator()
     def revision_missing(self, revisions, db=None, cur=None):
-        """List revisions missing from storage
-
-        Args:
-            revisions (iterable): revision ids
-
-        Yields:
-            missing revision ids
-
-        """
         if not revisions:
             return
 
         for obj in db.revision_missing_from_list(revisions, cur):
             yield obj[0]
 
-    @remote_api_endpoint('revision')
     @timed
     @db_transaction_generator(statement_timeout=1000)
     def revision_get(self, revisions, db=None, cur=None):
-        """Get all revisions from storage
-
-        Args:
-            revisions: an iterable of revision ids
-
-        Returns:
-            iterable: an iterable of revisions as dictionaries (or None if the
-                revision doesn't exist)
-
-        """
         for line in db.revision_get_from_list(revisions, cur):
             data = converters.db_to_revision(
                 dict(zip(db.revision_get_cols, line))
@@ -967,20 +602,9 @@
                 continue
             yield data
 
-    @remote_api_endpoint('revision/log')
     @timed
     @db_transaction_generator(statement_timeout=2000)
     def revision_log(self, revisions, limit=None, db=None, cur=None):
-        """Fetch revision entry from the given root revisions.
-
-        Args:
-            revisions: array of root revision to lookup
-            limit: limitation on the output result. Default to None.
-
-        Yields:
-            List of revision log from such revisions root.
-
-        """
         for line in db.revision_log(revisions, limit, cur):
             data = converters.db_to_revision(
                 dict(zip(db.revision_get_cols, line))
@@ -990,64 +614,21 @@
                 continue
             yield data
 
-    @remote_api_endpoint('revision/shortlog')
     @timed
     @db_transaction_generator(statement_timeout=2000)
     def revision_shortlog(self, revisions, limit=None, db=None, cur=None):
-        """Fetch the shortlog for the given revisions
-
-        Args:
-            revisions: list of root revisions to lookup
-            limit: depth limitation for the output
-
-        Yields:
-            a list of (id, parents) tuples.
-
-        """
         yield from db.revision_shortlog(revisions, limit, cur)
 
-    @remote_api_endpoint('revision/get_random')
     @timed
     @db_transaction()
     def revision_get_random(self, db=None, cur=None):
-        """Finds a random revision id.
-
-        Returns:
-            a sha1_git
-        """
         return db.revision_get_random(cur)
 
-    @remote_api_endpoint('release/add')
     @timed
     @process_metrics
     @db_transaction()
     def release_add(self, releases, db=None, cur=None):
-        """Add releases to the storage
-
-        Args:
-            releases (Iterable[dict]): iterable of dictionaries representing
-                the individual releases to add. Each dict has the following
-                keys:
-
-                - **id** (:class:`sha1_git`): id of the release to add
-                - **revision** (:class:`sha1_git`): id of the revision the
-                  release points to
-                - **date** (:class:`dict`): the date the release was made
-                - **name** (:class:`bytes`): the name of the release
-                - **comment** (:class:`bytes`): the comment associated with
-                  the release
-                - **author** (:class:`Dict[str, bytes]`): dictionary with
-                  keys: name, fullname, email
-
-        the date dictionary has the form defined in :mod:`swh.model`
-
-        Returns:
-            Summary dict of keys with associated count as values
-
-                release:add: New objects contents actually stored in db
-
-        """
         releases = list(releases)
         summary = {'release:add': 0}
@@ -1079,90 +660,33 @@
 
         return {'release:add': len(releases_missing)}
 
-    @remote_api_endpoint('release/missing')
     @timed
     @db_transaction_generator()
     def release_missing(self, releases, db=None, cur=None):
-        """List releases missing from storage
-
-        Args:
-            releases: an iterable of release ids
-
-        Returns:
-            a list of missing release ids
-
-        """
         if not releases:
             return
 
         for obj in db.release_missing_from_list(releases, cur):
             yield obj[0]
 
-    @remote_api_endpoint('release')
     @timed
     @db_transaction_generator(statement_timeout=500)
     def release_get(self, releases, db=None, cur=None):
-        """Given a list of sha1, return the releases's information
-
-        Args:
-            releases: list of sha1s
-
-        Yields:
-            dicts with the same keys as those given to `release_add`
-            (or ``None`` if a release does not exist)
-
-        """
         for release in db.release_get_from_list(releases, cur):
             data = converters.db_to_release(
                 dict(zip(db.release_get_cols, release))
             )
             yield data if data['target_type'] else None
 
-    @remote_api_endpoint('release/get_random')
     @timed
     @db_transaction()
     def release_get_random(self, db=None, cur=None):
-        """Finds a random release id.
-
-        Returns:
-            a sha1_git
-        """
         return db.release_get_random(cur)
 
-    @remote_api_endpoint('snapshot/add')
     @timed
     @process_metrics
     @db_transaction()
     def snapshot_add(self, snapshots, db=None, cur=None):
-        """Add snapshots to the storage.
-
-        Args:
-            snapshot ([dict]): the snapshots to add, containing the
-                following keys:
-
-                - **id** (:class:`bytes`): id of the snapshot
-                - **branches** (:class:`dict`): branches the snapshot contains,
-                  mapping the branch name (:class:`bytes`) to the branch target,
-                  itself a :class:`dict` (or ``None`` if the branch points to an
-                  unknown object)
-
-                  - **target_type** (:class:`str`): one of ``content``,
-                    ``directory``, ``revision``, ``release``,
-                    ``snapshot``, ``alias``
-                  - **target** (:class:`bytes`): identifier of the target
-                    (currently a ``sha1_git`` for all object kinds, or the name
-                    of the target branch for aliases)
-
-        Raises:
-            ValueError: if the origin or visit id does not exist.
-
-        Returns:
-
-            Summary dict of keys with associated count as values
-
-                snapshot:add: Count of object actually stored in db
-
-        """
         created_temp_table = False
         count = 0
@@ -1195,78 +719,21 @@
 
         return {'snapshot:add': count}
 
-    @remote_api_endpoint('snapshot/missing')
     @timed
     @db_transaction_generator()
     def snapshot_missing(self, snapshots, db=None, cur=None):
-        """List snapshots missing from storage
-
-        Args:
-            snapshots (iterable): an iterable of snapshot ids
-
-        Yields:
-            missing snapshot ids
-
-        """
         for obj in db.snapshot_missing_from_list(snapshots, cur):
             yield obj[0]
 
-    @remote_api_endpoint('snapshot')
     @timed
     @db_transaction(statement_timeout=2000)
     def snapshot_get(self, snapshot_id, db=None, cur=None):
-        """Get the content, possibly partial, of a snapshot with the given id
-
-        The branches of the snapshot are iterated in the lexicographical
-        order of their names.
-
-        .. warning:: At most 1000 branches contained in the snapshot will be
-            returned for performance reasons. In order to browse the whole
-            set of branches, the method :meth:`snapshot_get_branches`
-            should be used instead.
-
-        Args:
-            snapshot_id (bytes): identifier of the snapshot
-        Returns:
-            dict: a dict with three keys:
-                * **id**: identifier of the snapshot
-                * **branches**: a dict of branches contained in the snapshot
-                  whose keys are the branches' names.
-                * **next_branch**: the name of the first branch not returned
-                  or :const:`None` if the snapshot has less than 1000
-                  branches.
-        """
         return self.snapshot_get_branches(snapshot_id, db=db, cur=cur)
 
-    @remote_api_endpoint('snapshot/by_origin_visit')
     @timed
     @db_transaction(statement_timeout=2000)
     def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None):
-        """Get the content, possibly partial, of a snapshot for the given origin visit
-
-        The branches of the snapshot are iterated in the lexicographical
-        order of their names.
-
-        .. warning:: At most 1000 branches contained in the snapshot will be
-            returned for performance reasons. In order to browse the whole
-            set of branches, the method :meth:`snapshot_get_branches`
-            should be used instead.
-
-        Args:
-            origin (int): the origin identifier
-            visit (int): the visit identifier
-        Returns:
-            dict: None if the snapshot does not exist;
-            a dict with three keys otherwise:
-                * **id**: identifier of the snapshot
-                * **branches**: a dict of branches contained in the snapshot
-                  whose keys are the branches' names.
-                * **next_branch**: the name of the first branch not returned
-                  or :const:`None` if the snapshot has less than 1000
-                  branches.
-
-        """
         snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur)
 
         if snapshot_id:
@@ -1274,38 +741,10 @@
 
         return None
 
-    @remote_api_endpoint('snapshot/latest')
     @timed
     @db_transaction(statement_timeout=4000)
     def snapshot_get_latest(self, origin, allowed_statuses=None, db=None,
                             cur=None):
-        """Get the content, possibly partial, of the latest snapshot for the
-        given origin, optionally only from visits that have one of the given
-        allowed_statuses
-
-        The branches of the snapshot are iterated in the lexicographical
-        order of their names.
-
-        .. warning:: At most 1000 branches contained in the snapshot will be
-            returned for performance reasons. In order to browse the whole
-            set of branches, the method :meth:`snapshot_get_branches`
-            should be used instead.
-
-        Args:
-            origin (str): the origin's URL
-            allowed_statuses (list of str): list of visit statuses considered
-                to find the latest snapshot for the visit. For instance,
-                ``allowed_statuses=['full']`` will only consider visits that
-                have successfully run to completion.
-        Returns:
-            dict: a dict with three keys:
-                * **id**: identifier of the snapshot
-                * **branches**: a dict of branches contained in the snapshot
-                  whose keys are the branches' names.
-                * **next_branch**: the name of the first branch not returned
-                  or :const:`None` if the snapshot has less than 1000
-                  branches.
-        """
         if isinstance(origin, int):
             origin = self.origin_get({'id': origin}, db=db, cur=cur)
             if not origin:
@@ -1323,53 +762,17 @@
                 'last origin visit references an unknown snapshot')
         return snapshot
 
-    @remote_api_endpoint('snapshot/count_branches')
     @timed
     @db_transaction(statement_timeout=2000)
    def snapshot_count_branches(self, snapshot_id, db=None, cur=None):
-        """Count the number of branches in the snapshot with the given id
-
-        Args:
-            snapshot_id (bytes): identifier of the snapshot
-
-        Returns:
-            dict: A dict whose keys are the target types of branches and
-            values their corresponding amount
-        """
         return dict([bc for bc in
                      db.snapshot_count_branches(snapshot_id, cur)])
 
-    @remote_api_endpoint('snapshot/get_branches')
     @timed
     @db_transaction(statement_timeout=2000)
     def snapshot_get_branches(self, snapshot_id, branches_from=b'',
                               branches_count=1000, target_types=None,
                               db=None, cur=None):
-        """Get the content, possibly partial, of a snapshot with the given id
-
-        The branches of the snapshot are iterated in the lexicographical
-        order of their names.
-
-        Args:
-            snapshot_id (bytes): identifier of the snapshot
-            branches_from (bytes): optional parameter used to skip branches
-                whose name is lesser than it before returning them
-            branches_count (int): optional parameter used to restrain
-                the amount of returned branches
-            target_types (list): optional parameter used to filter the
-                target types of branch to return (possible values that can be
-                contained in that list are `'content', 'directory',
-                'revision', 'release', 'snapshot', 'alias'`)
-        Returns:
-            dict: None if the snapshot does not exist;
-            a dict with three keys otherwise:
-                * **id**: identifier of the snapshot
-                * **branches**: a dict of branches contained in the snapshot
-                  whose keys are the branches' names.
-                * **next_branch**: the name of the first branch not returned
-                  or :const:`None` if the snapshot has less than
-                  `branches_count` branches after `branches_from` included.
-        """
         if snapshot_id == EMPTY_SNAPSHOT_ID:
             return {
                 'id': snapshot_id,
@@ -1406,36 +809,15 @@
 
         return None
 
-    @remote_api_endpoint('snapshot/get_random')
     @timed
     @db_transaction()
     def snapshot_get_random(self, db=None, cur=None):
-        """Finds a random snapshot id.
-
-        Returns:
-            a sha1_git
-        """
         return db.snapshot_get_random(cur)
 
-    @remote_api_endpoint('origin/visit/add')
     @timed
     @db_transaction()
     def origin_visit_add(self, origin, date, type, db=None, cur=None):
-        """Add an origin_visit for the origin at ts with status 'ongoing'.
-
-        Args:
-            origin (str): visited origin's identifier or URL
-            date (Union[str,datetime]): timestamp of such visit
-            type (str): the type of loader used for the visit (hg, git, ...)
-
-        Returns:
-            dict: dictionary with keys origin and visit where:
-
-            - origin: origin identifier
-            - visit: the visit identifier for the new visit occurrence
-
-        """
         origin_url = origin
 
         if isinstance(date, str):
@@ -1458,26 +840,11 @@
             'visit': visit_id,
         }
 
-    @remote_api_endpoint('origin/visit/update')
     @timed
     @db_transaction()
     def origin_visit_update(self, origin, visit_id, status=None,
                             metadata=None, snapshot=None,
                             db=None, cur=None):
-        """Update an origin_visit's status.
-
-        Args:
-            origin (str): visited origin's URL
-            visit_id: Visit's id
-            status: Visit's new status
-            metadata: Data associated to the visit
-            snapshot (sha1_git): identifier of the snapshot to add to
-                the visit
-
-        Returns:
-            None
-
-        """
         if not isinstance(origin, str):
             raise TypeError('origin must be a string, not %r' % (origin,))
         origin_url = origin
@@ -1503,25 +870,9 @@
 
         db.origin_visit_update(origin_url, visit_id, updates, cur)
 
-    @remote_api_endpoint('origin/visit/upsert')
     @timed
     @db_transaction()
     def origin_visit_upsert(self, visits, db=None, cur=None):
-        """Add a origin_visits with a specific id and with all its data.
-        If there is already an origin_visit with the same
-        `(origin_id, visit_id)`, overwrites it.
-
-        Args:
-            visits: iterable of dicts with keys:
-
-            - **origin**: dict with keys either `id` or `url`
-            - **visit**: origin visit id
-            - **date**: timestamp of such visit
-            - **status**: Visit's new status
-            - **metadata**: Data associated to the visit
-            - **snapshot**: identifier of the snapshot to add to
-              the visit
-        """
         visits = copy.deepcopy(visits)
         for visit in visits:
             if isinstance(visit['date'], str):
@@ -1538,142 +889,55 @@
             # TODO: upsert them all in a single query
             db.origin_visit_upsert(**visit, cur=cur)
 
-    @remote_api_endpoint('origin/visit/get')
     @timed
     @db_transaction_generator(statement_timeout=500)
     def origin_visit_get(self, origin, last_visit=None, limit=None, db=None,
                          cur=None):
-        """Retrieve all the origin's visit's information.
-
-        Args:
-            origin (str): The visited origin
-            last_visit: Starting point from which listing the next visits
-                Default to None
-            limit (int): Number of results to return from the last visit.
-                Default to None
-
-        Yields:
-            List of visits.
-
-        """
         for line in db.origin_visit_get_all(
                 origin, last_visit=last_visit, limit=limit, cur=cur):
             data = dict(zip(db.origin_visit_get_cols, line))
             yield data
 
-    @remote_api_endpoint('origin/visit/find_by_date')
     @timed
     @db_transaction(statement_timeout=500)
     def origin_visit_find_by_date(self, origin, visit_date, db=None,
                                   cur=None):
-        """Retrieves the origin visit whose date is closest to the provided
-        timestamp.
-        In case of a tie, the visit with largest id is selected.
-
-        Args:
-            origin (str): The occurrence's origin (URL).
-            target (datetime): target timestamp
-
-        Returns:
-            A visit.
-
-        """
         line = db.origin_visit_find_by_date(origin, visit_date, cur=cur)
         if line:
            return dict(zip(db.origin_visit_get_cols, line))
 
-    @remote_api_endpoint('origin/visit/getby')
     @timed
     @db_transaction(statement_timeout=500)
     def origin_visit_get_by(self, origin, visit, db=None, cur=None):
-        """Retrieve origin visit's information.
- - Returns: - The information on that particular (origin, visit) or None if - it does not exist - - """ ori_visit = db.origin_visit_get(origin, visit, cur) if not ori_visit: return None return dict(zip(db.origin_visit_get_cols, ori_visit)) - @remote_api_endpoint('origin/visit/get_latest') @timed @db_transaction(statement_timeout=4000) def origin_visit_get_latest( self, origin, allowed_statuses=None, require_snapshot=False, db=None, cur=None): - """Get the latest origin visit for the given origin, optionally - looking only for those with one of the given allowed_statuses - or for those with a known snapshot. - - Args: - origin (str): the origin's URL - allowed_statuses (list of str): list of visit statuses considered - to find the latest visit. For instance, - ``allowed_statuses=['full']`` will only consider visits that - have successfully run to completion. - require_snapshot (bool): If True, only a visit with a snapshot - will be returned. - - Returns: - dict: a dict with the following keys: - - - **origin**: the URL of the origin - - **visit**: origin visit id - - **type**: type of loader used for the visit - - **date**: timestamp of such visit - - **status**: Visit's new status - - **metadata**: Data associated to the visit - - **snapshot** (Optional[sha1_git]): identifier of the snapshot - associated to the visit - """ origin_visit = db.origin_visit_get_latest( origin, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, cur=cur) if origin_visit: return dict(zip(db.origin_visit_get_cols, origin_visit)) - @remote_api_endpoint('origin/visit/get_random') @timed @db_transaction() def origin_visit_get_random( self, type: str, db=None, cur=None) -> Optional[Dict[str, Any]]: - """Randomly select one successful origin visit with - made in the last 3 months. - - Returns: - dict representing an origin visit, in the same format as - :py:meth:`origin_visit_get`. - - """ result = db.origin_visit_get_random(type, cur) if result: return dict(zip(db.origin_visit_get_cols, result)) else: return None - @remote_api_endpoint('object/find_by_sha1_git') @timed @db_transaction(statement_timeout=2000) def object_find_by_sha1_git(self, ids, db=None, cur=None): - """Return the objects found with the given ids. - - Args: - ids: a generator of sha1_gits - - Returns: - dict: a mapping from id to the list of objects found. Each object - found is itself a dict with keys: - - - sha1_git: the input id - - type: the type of object found - - """ ret = {id: [] for id in ids} for retval in db.object_find_by_sha1_git(ids, cur=cur): @@ -1683,33 +947,9 @@ return ret - @remote_api_endpoint('origin/get') @timed @db_transaction(statement_timeout=500) def origin_get(self, origins, db=None, cur=None): - """Return origins, either all identified by their ids or all - identified by tuples (type, url). - - If the url is given and the type is omitted, one of the origins with - that url is returned. - - Args: - origin: a list of dictionaries representing the individual - origins to find. - These dicts have the key url: - - - url (bytes): the url the origin points to - - Returns: - dict: the origin dictionary with the keys: - - - id: origin's id - - url: origin's url - - Raises: - ValueError: if the url or the id don't exist. 
- - """ if isinstance(origins, dict): # Old API return_single = True @@ -1733,67 +973,26 @@ else: return [None if res['url'] is None else res for res in results] - @remote_api_endpoint('origin/get_sha1') @timed @db_transaction_generator(statement_timeout=500) def origin_get_by_sha1(self, sha1s, db=None, cur=None): - """Return origins, identified by the sha1 of their URLs. - - Args: - sha1s (list[bytes]): a list of sha1s - - Yields: - dicts containing origin information as returned - by :meth:`swh.storage.storage.Storage.origin_get`, or None if an - origin matching the sha1 is not found. - - """ for line in db.origin_get_by_sha1(sha1s, cur): if line[0] is not None: yield dict(zip(db.origin_cols, line)) else: yield None - @remote_api_endpoint('origin/get_range') @timed @db_transaction_generator() def origin_get_range(self, origin_from=1, origin_count=100, db=None, cur=None): - """Retrieve ``origin_count`` origins whose ids are greater - or equal than ``origin_from``. - - Origins are sorted by id before retrieving them. - - Args: - origin_from (int): the minimum id of origins to retrieve - origin_count (int): the maximum number of origins to retrieve - - Yields: - dicts containing origin information as returned - by :meth:`swh.storage.storage.Storage.origin_get`. - """ for origin in db.origin_get_range(origin_from, origin_count, cur): yield dict(zip(db.origin_get_range_cols, origin)) - @remote_api_endpoint('origin/list') @timed @db_transaction() def origin_list(self, page_token: Optional[str] = None, limit: int = 100, *, db=None, cur=None) -> dict: - """Returns the list of origins - - Args: - page_token: opaque token used for pagination. - limit: the maximum number of results to return - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieving the next page. if absent, there is - no more pages to gather. - - **origins** (List[dict]): list of origins, as returned by - `origin_get`. - """ page_token = page_token or '0' if not isinstance(page_token, str): raise TypeError('page_token must be a string.') @@ -1814,70 +1013,23 @@ return result - @remote_api_endpoint('origin/search') @timed @db_transaction_generator() def origin_search(self, url_pattern, offset=0, limit=50, regexp=False, with_visit=False, db=None, cur=None): - """Search for origins whose urls contain a provided string pattern - or match a provided regular expression. - The search is performed in a case insensitive way. - - Args: - url_pattern (str): the string pattern to search for in origin urls - offset (int): number of found origins to skip before returning - results - limit (int): the maximum number of found origins to return - regexp (bool): if True, consider the provided pattern as a regular - expression and return origins whose urls match it - with_visit (bool): if True, filter out origins with no visit - - Yields: - dicts containing origin information as returned - by :meth:`swh.storage.storage.Storage.origin_get`. - """ for origin in db.origin_search(url_pattern, offset, limit, regexp, with_visit, cur): yield dict(zip(db.origin_cols, origin)) - @remote_api_endpoint('origin/count') @timed @db_transaction() def origin_count(self, url_pattern, regexp=False, with_visit=False, db=None, cur=None): - """Count origins whose urls contain a provided string pattern - or match a provided regular expression. - The pattern search in origin urls is performed in a case insensitive - way. 
-
-        Args:
-            url_pattern (str): the string pattern to search for in origin urls
-            regexp (bool): if True, consider the provided pattern as a regular
-                expression and return origins whose urls match it
-            with_visit (bool): if True, filter out origins with no visit
-
-        Returns:
-            int: The number of origins matching the search criterion.
-        """
         return db.origin_count(url_pattern, regexp, with_visit, cur)
 
-    @remote_api_endpoint('origin/add_multi')
     @timed
     @db_transaction()
     def origin_add(self, origins, db=None, cur=None):
-        """Add origins to the storage
-
-        Args:
-            origins: list of dictionaries representing the individual origins,
-                with the following keys:
-
-                - type: the origin type ('git', 'svn', 'deb', ...)
-                - url (bytes): the url the origin points to
-
-        Returns:
-            list: given origins as dict updated with their id
-
-        """
         origins = copy.deepcopy(list(origins))
         for origin in origins:
             self.origin_add_one(origin, db=db, cur=cur)
@@ -1885,24 +1037,9 @@
         send_metric('origin:add', count=len(origins), method_name='origin_add')
         return origins
 
-    @remote_api_endpoint('origin/add')
     @timed
     @db_transaction()
     def origin_add_one(self, origin, db=None, cur=None):
-        """Add origin to the storage
-
-        Args:
-            origin: dictionary representing the individual origin to add. This
-                dict has the following keys:
-
-                - type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
-                - url (bytes): the url the origin points to
-
-        Returns:
-            the id of the added origin, or of the identical one that already
-            exists.
-
-        """
         origin_row = list(db.origin_get_by_url([origin['url']], cur))[0]
         origin_url = dict(zip(db.origin_cols, origin_row))['url']
         if origin_url:
@@ -1917,18 +1054,10 @@
 
     @db_transaction(statement_timeout=500)
     def stat_counters(self, db=None, cur=None):
-        """compute statistics about the number of tuples in various tables
-
-        Returns:
-            dict: a dictionary mapping textual labels (e.g., content) to
-            integer values (e.g., the number of tuples in table content)
-
-        """
         return {k: v for (k, v) in db.stat_counters()}
 
     @db_transaction()
     def refresh_stat_counters(self, db=None, cur=None):
-        """Recomputes the statistics for `stat_counters`."""
         keys = [
             'content',
             'directory',
@@ -1947,21 +1076,10 @@
         for key in keys:
             cur.execute('select * from swh_update_counter(%s)', (key,))
 
-    @remote_api_endpoint('origin/metadata/add')
     @timed
     @db_transaction()
     def origin_metadata_add(self, origin_url, ts, provider, tool, metadata,
                             db=None, cur=None):
-        """ Add an origin_metadata for the origin at ts with provenance and
-        metadata.
-
-        Args:
-            origin_url (str): the origin url for which the metadata is added
-            ts (datetime): timestamp of the found metadata
-            provider (int): the provider of metadata (ex:'hal')
-            tool (int): tool used to extract metadata
-            metadata (jsonb): the metadata retrieved at the time and location
-        """
         if isinstance(ts, str):
             ts = dateutil.parser.parse(ts)
 
@@ -1970,54 +1088,16 @@
         send_metric(
             'origin_metadata:add', count=1, method_name='origin_metadata_add')
 
-    @remote_api_endpoint('origin/metadata/get')
     @timed
     @db_transaction_generator(statement_timeout=500)
     def origin_metadata_get_by(self, origin_url, provider_type=None, db=None,
                                cur=None):
-        """Retrieve list of all origin_metadata entries for the origin_id
-
-        Args:
-            origin_url (str): the origin's URL
-            provider_type (str): (optional) type of provider
-
-        Returns:
-            list of dicts: the origin_metadata dictionary with the keys:
-
-            - origin_id (int): origin's id
-            - discovery_date (datetime): timestamp of discovery
-            - tool_id (int): metadata's extracting tool
-            - metadata (jsonb)
-            - provider_id (int): metadata's provider
-            - provider_name (str)
-            - provider_type (str)
-            - provider_url (str)
-
-        """
         for line in db.origin_metadata_get_by(origin_url, provider_type, cur):
             yield dict(zip(db.origin_metadata_get_cols, line))
 
-    @remote_api_endpoint('tool/add')
     @timed
     @db_transaction()
     def tool_add(self, tools, db=None, cur=None):
-        """Add new tools to the storage.
-
-        Args:
-            tools (iterable of :class:`dict`): Tool information to add to
-                storage. Each tool is a :class:`dict` with the following keys:
-
-                - name (:class:`str`): name of the tool
-                - version (:class:`str`): version of the tool
-                - configuration (:class:`dict`): configuration of the tool,
-                  must be json-encodable
-
-        Returns:
-            :class:`dict`: All the tools inserted in storage
-            (including the internal ``id``). The order of the list is not
-            guaranteed to match the order of the initial list.
-
-        """
         db.mktemp_tool(cur)
         db.copy_to(tools, 'tmp_tool',
                    ['name', 'version', 'configuration'],
@@ -2028,21 +1108,9 @@
         send_metric('tool:add', count=len(results), method_name='tool_add')
         return results
 
-    @remote_api_endpoint('tool/data')
     @timed
     @db_transaction(statement_timeout=500)
     def tool_get(self, tool, db=None, cur=None):
-        """Retrieve tool information.
-
-        Args:
-            tool (dict): Tool information we want to retrieve from storage.
-                The dicts have the same keys as those used in :func:`tool_add`.
-
-        Returns:
-            dict: The full tool information if it exists (``id`` included),
-            None otherwise.
-
-        """
         tool_conf = tool['configuration']
         if isinstance(tool_conf, dict):
             tool_conf = json.dumps(tool_conf)
@@ -2054,118 +1122,41 @@
             return None
         return dict(zip(db.tool_cols, idx))
 
-    @remote_api_endpoint('provider/add')
     @timed
     @db_transaction()
     def metadata_provider_add(self, provider_name, provider_type, provider_url,
                               metadata, db=None, cur=None):
-        """Add a metadata provider.
-
-        Args:
-            provider_name (str): Its name
-            provider_type (str): Its type (eg. `'deposit-client'`)
-            provider_url (str): Its URL
-            metadata: JSON-encodable object
-
-        Returns:
-            int: an identifier of the provider
-        """
         result = db.metadata_provider_add(provider_name, provider_type,
                                           provider_url, metadata, cur)
         send_metric(
             'metadata_provider:add', count=1, method_name='metadata_provider')
         return result
 
-    @remote_api_endpoint('provider/get')
     @timed
     @db_transaction()
     def metadata_provider_get(self, provider_id, db=None, cur=None):
-        """Get a metadata provider
-
-        Args:
-            provider_id: Its identifier, as given by `metadata_provider_add`.
-
-        Returns:
-            dict: same as `metadata_provider_add`;
-            or None if it does not exist.
-        """
         result = db.metadata_provider_get(provider_id)
         if not result:
             return None
         return dict(zip(db.metadata_provider_cols, result))
 
-    @remote_api_endpoint('provider/getby')
     @timed
     @db_transaction()
     def metadata_provider_get_by(self, provider, db=None, cur=None):
-        """Get a metadata provider
-
-        Args:
-            provider (dict): A dictionary with keys:
-                * provider_name: Its name
-                * provider_url: Its URL
-
-        Returns:
-            dict: same as `metadata_provider_add`;
-            or None if it does not exist.
-        """
         result = db.metadata_provider_get_by(provider['provider_name'],
                                              provider['provider_url'])
         if not result:
             return None
         return dict(zip(db.metadata_provider_cols, result))
 
-    @remote_api_endpoint('algos/diff_directories')
     @timed
     def diff_directories(self, from_dir, to_dir, track_renaming=False):
-        """Compute the list of file changes introduced between two arbitrary
-        directories (insertion / deletion / modification / renaming of files).
-
-        Args:
-            from_dir (bytes): identifier of the directory to compare from
-            to_dir (bytes): identifier of the directory to compare to
-            track_renaming (bool): whether or not to track files renaming
-
-        Returns:
-            A list of dict describing the introduced file changes
-            (see :func:`swh.storage.algos.diff.diff_directories`
-            for more details).
-        """
         return diff.diff_directories(self, from_dir, to_dir, track_renaming)
 
-    @remote_api_endpoint('algos/diff_revisions')
     @timed
     def diff_revisions(self, from_rev, to_rev, track_renaming=False):
-        """Compute the list of file changes introduced between two arbitrary
-        revisions (insertion / deletion / modification / renaming of files).
-
-        Args:
-            from_rev (bytes): identifier of the revision to compare from
-            to_rev (bytes): identifier of the revision to compare to
-            track_renaming (bool): whether or not to track files renaming
-
-        Returns:
-            A list of dict describing the introduced file changes
-            (see :func:`swh.storage.algos.diff.diff_directories`
-            for more details).
-        """
         return diff.diff_revisions(self, from_rev, to_rev, track_renaming)
 
-    @remote_api_endpoint('algos/diff_revision')
     @timed
     def diff_revision(self, revision, track_renaming=False):
-        """Compute the list of file changes introduced by a specific revision
-        (insertion / deletion / modification / renaming of files) by comparing
-        it against its first parent.
-
-        Args:
-            revision (bytes): identifier of the revision from which to
-                compute the list of files changes
-            track_renaming (bool): whether or not to track files renaming
-
-        Returns:
-            A list of dict describing the introduced file changes
-            (see :func:`swh.storage.algos.diff.diff_directories`
-            for more details).
- """ return diff.diff_revision(self, revision, track_renaming) diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -6,6 +6,7 @@ import copy from contextlib import contextmanager import datetime +import inspect import itertools import math import queue @@ -28,6 +29,7 @@ from swh.model.hypothesis_strategies import objects from swh.storage import HashCollision from swh.storage.converters import origin_url_to_sha1 as sha1 +from swh.storage.interface import StorageInterface from .storage_data import data @@ -95,6 +97,34 @@ """ maxDiff = None # type: ClassVar[Optional[int]] + def test_types(self, swh_storage): + """Checks all methods of StorageInterface are implemented by this + backend, and that they have the same signature.""" + # Create an instance of the protocol (which cannot be instantiated + # directly, so this creates a subclass, then instantiates it) + interface = type('_', (StorageInterface,), {})() + + assert 'content_add' in dir(interface) + + missing_methods = [] + + for meth_name in dir(interface): + if meth_name.startswith('_'): + continue + interface_meth = getattr(interface, meth_name) + try: + concrete_meth = getattr(swh_storage, meth_name) + except AttributeError: + missing_methods.append(meth_name) + continue + + expected_signature = inspect.signature(interface_meth) + actual_signature = inspect.signature(concrete_meth) + + assert expected_signature == actual_signature, meth_name + + assert missing_methods == [] + def test_check_config(self, swh_storage): assert swh_storage.check_config(check_write=True) assert swh_storage.check_config(check_write=False)