D2587.id.diff

diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -6,13 +6,13 @@
from swh.core.api import RPCClient
from ..exc import StorageAPIError
-from ..storage import Storage
+from ..interface import StorageInterface
class RemoteStorage(RPCClient):
"""Proxy to a remote storage API"""
api_exception = StorageAPIError
- backend_class = Storage
+ backend_class = StorageInterface
def reset(self):
return self.post('reset', {})
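# Illustrative sketch, not part of the patch: with backend_class pointing at
# StorageInterface, RPCClient generates one proxy method per
# @remote_api_endpoint declared on the interface, so the remote storage is
# called like a local one. The URL and hash below are placeholders, and the
# `url` keyword is assumed to be accepted by the swh.core RPCClient
# constructor.
from swh.storage.api.client import RemoteStorage

storage = RemoteStorage(url='http://localhost:5002/')
missing = list(storage.content_missing_per_sha1([b'\x00' * 20]))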
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -12,7 +12,7 @@
error_handler,
encode_data_server as encode_data)
-from ..storage import Storage
+from ..interface import StorageInterface
from ..metrics import timed
@@ -25,7 +25,7 @@
app = RPCServerApp(__name__,
- backend_class=Storage,
+ backend_class=StorageInterface,
backend_factory=get_storage)
storage = None
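# Illustrative sketch, not part of the patch: RPCServerApp registers one
# route per @remote_api_endpoint found on backend_class and dispatches each
# request to the backend built by backend_factory. A minimal standalone
# server might look like this (the exact get_storage call signature is an
# assumption and may differ across swh.storage versions):
from swh.core.api import RPCServerApp
from swh.storage import get_storage
from swh.storage.interface import StorageInterface

demo_app = RPCServerApp('storage-demo',
                        backend_class=StorageInterface,
                        backend_factory=lambda: get_storage('memory', {}))

if __name__ == '__main__':
    demo_app.run(host='127.0.0.1', port=5002)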
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -72,7 +72,6 @@
self.objstorage = get_objstorage('memory', {})
def check_config(self, *, check_write):
- """Check that the storage is configured and ready to go."""
return True
def _content_add(self, contents, with_data):
@@ -159,66 +158,18 @@
return count
def _content_to_model(self, contents):
- """Takes a list of content dicts, optionally with an extra 'origin'
- key, and yields tuples (model.Content, origin)."""
for content in contents:
content = content.copy()
content.pop('origin', None)
yield Content.from_dict(content)
def content_add(self, content):
- """Add content blobs to the storage
-
- Args:
- content (iterable): iterable of dictionaries representing
- individual pieces of content to add. Each dictionary has the
- following keys:
-
- - data (bytes): the actual content
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.DEFAULT_ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
- - reason (str): if status = absent, the reason why
- - origin (int): if status = absent, the origin we saw the
- content in
-
- Raises:
- HashCollision in case of collision
-
- Returns:
- Summary dict with the following key and associated values:
-
- content:add: New contents added
- content_bytes:add: Sum of the contents' length data
- skipped_content:add: New skipped contents (no data) added
-
- """
now = datetime.datetime.now(tz=datetime.timezone.utc)
content = [attr.evolve(c, ctime=now)
for c in self._content_to_model(content)]
return self._content_add(content, with_data=True)
def content_update(self, content, keys=[]):
- """Update content blobs to the storage. Does nothing for unknown
- contents or skipped ones.
-
- Args:
- content (iterable): iterable of dictionaries representing
- individual pieces of content to update. Each dictionary has the
- following keys:
-
- - data (bytes): the actual content
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
-
- keys (list): List of keys (str) whose values needs an update, e.g.,
- new hash column
- """
if self.journal_writer:
raise NotImplementedError(
'content_update is not yet supported with a journal_writer.')
@@ -243,58 +194,10 @@
self._content_indexes[algorithm][hash_].add(new_key)
def content_add_metadata(self, content):
- """Add content metadata to the storage (like `content_add`, but
- without inserting to the objstorage).
-
- Args:
- content (iterable): iterable of dictionaries representing
- individual pieces of content to add. Each dictionary has the
- following keys:
-
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.DEFAULT_ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
- - reason (str): if status = absent, the reason why
- - origin (int): if status = absent, the origin we saw the
- content in
- - ctime (datetime): time of insertion in the archive
-
- Raises:
- HashCollision in case of collision
-
- Returns:
- Summary dict with the following key and associated values:
-
- content:add: New contents added
- skipped_content:add: New skipped contents (no data) added
-
- """
content = list(self._content_to_model(content))
return self._content_add(content, with_data=False)
def content_get(self, content):
- """Retrieve in bulk contents and their data.
-
- This function may yield more blobs than provided sha1 identifiers,
- in case they collide.
-
- Args:
- content: iterables of sha1
-
- Yields:
- Dict[str, bytes]: Generates streams of contents as dict with their
- raw data:
-
- - sha1 (bytes): content id
- - data (bytes): content's raw data
-
- Raises:
- ValueError in case of too much contents are required.
- cf. BULK_BLOCK_CONTENT_LEN_MAX
-
- """
# FIXME: Make this method support slicing the `data`.
if len(content) > BULK_BLOCK_CONTENT_LEN_MAX:
raise ValueError(
@@ -309,26 +212,6 @@
yield {'sha1': obj_id, 'data': data}
def content_get_range(self, start, end, limit=1000):
- """Retrieve contents within range [start, end] bound by limit.
-
- Note that this function may return more than one blob per hash. The
- limit is enforced with multiplicity (ie. two blobs with the same hash
- will count twice toward the limit).
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **limit** (int): Limit result (default to 1000)
-
- Returns:
- a dict with keys:
- - contents [dict]: iterable of contents in between the range.
- - next (bytes): There remains content in the range
- starting from this next sha1
-
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
from_index = bisect.bisect_left(self._sorted_sha1s, start)
@@ -353,25 +236,6 @@
def content_get_partition(
self, partition_id: int, nb_partitions: int, limit: int = 1000,
page_token: str = None):
- """Splits contents into nb_partitions, and returns one of these based on
- partition_id (which must be in [0, nb_partitions-1])
-
- There is no guarantee on how the partitioning is done, or the
- result order.
-
- Args:
- partition_id (int): index of the partition to fetch
- nb_partitions (int): total number of partitions to split into
- limit (int): Limit result (default to 1000)
- page_token (Optional[str]): opaque token used for pagination.
-
- Returns:
- a dict with keys:
- - contents (List[dict]): iterable of contents in the partition.
- - **next_page_token** (Optional[str]): opaque token to be used as
- `page_token` for retrieving the next page. if absent, there is
- no more pages to gather.
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
(start, end) = get_partition_bounds_bytes(
@@ -391,17 +255,6 @@
def content_get_metadata(
self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
- """Retrieve content metadata in bulk
-
- Args:
- content: iterable of content identifiers (sha1)
-
- Returns:
- a dict with keys the content's sha1 and the associated value
- either the existing content's metadata or None if the content does
- not exist.
-
- """
result: Dict = {sha1: [] for sha1 in contents}
for sha1 in contents:
if sha1 in self._content_indexes['sha1']:
@@ -433,22 +286,6 @@
return [self._contents[key].to_dict() for key in keys]
def content_missing(self, content, key_hash='sha1'):
- """List content missing from storage
-
- Args:
- contents ([dict]): iterable of dictionaries whose keys are
- either 'length' or an item of
- :data:`swh.model.hashutil.ALGORITHMS`;
- mapped to the corresponding checksum
- (or length).
-
- key_hash (str): name of the column to use as hash id
- result (default: 'sha1')
-
- Returns:
- iterable ([bytes]): missing content ids (as per the
- key_hash column)
- """
for cont in content:
for (algo, hash_) in cont.items():
if algo not in DEFAULT_ALGORITHMS:
@@ -462,45 +299,16 @@
yield cont[key_hash]
def content_missing_per_sha1(self, contents):
- """List content missing from storage based only on sha1.
-
- Args:
- contents: Iterable of sha1 to check for absence.
-
- Returns:
- iterable: missing ids
-
- Raises:
- TODO: an exception when we get a hash collision.
-
- """
for content in contents:
if content not in self._content_indexes['sha1']:
yield content
def content_missing_per_sha1_git(self, contents):
- """List content missing from storage based only on sha1_git.
-
- Args:
- contents: An iterable of content id (sha1_git)
-
- Yields:
- missing contents sha1_git
- """
for content in contents:
if content not in self._content_indexes['sha1_git']:
yield content
def skipped_content_missing(self, contents):
- """List all skipped_content missing from storage
-
- Args:
- contents: Iterable of sha1 to check for skipped content entry
-
- Returns:
- iterable: dict of skipped content entry
- """
-
for content in contents:
for (key, algorithm) in self._content_key_algorithm(content):
if algorithm == 'blake2s256':
@@ -512,37 +320,9 @@
break
def content_get_random(self):
- """Finds a random content id.
-
- Returns:
- a sha1_git
- """
return random.choice(list(self._content_indexes['sha1_git']))
def directory_add(self, directories):
- """Add directories to the storage
-
- Args:
- directories (iterable): iterable of dictionaries representing the
- individual directories to add. Each dict has the following
- keys:
-
- - id (sha1_git): the id of the directory to add
- - entries (list): list of dicts for each entry in the
- directory. Each dict has the following keys:
-
- - name (bytes)
- - type (one of 'file', 'dir', 'rev'): type of the
- directory entry (file, directory, revision)
- - target (sha1_git): id of the object pointed at by the
- directory entry
- - perms (int): entry permissions
- Returns:
- Summary dict of keys with associated count as values:
-
- directory:add: Number of directories actually added
-
- """
directories = list(directories)
if self.journal_writer:
self.journal_writer.write_additions(
@@ -563,15 +343,6 @@
return {'directory:add': count}
def directory_missing(self, directories):
- """List directories missing from storage
-
- Args:
- directories (iterable): an iterable of directory ids
-
- Yields:
- missing directory ids
-
- """
for id in directories:
if id not in self._directories:
yield id
@@ -607,41 +378,12 @@
ret['target'], True, prefix + ret['name'] + b'/')
def directory_ls(self, directory, recursive=False):
- """Get entries for one directory.
-
- Args:
- - directory: the directory to list entries from.
- - recursive: if flag on, this list recursively from this directory.
-
- Returns:
- List of entries for such directory.
-
- If `recursive=True`, names in the path of a dir/file not at the
- root are concatenated with a slash (`/`).
- """
yield from self._directory_ls(directory, recursive)
def directory_entry_get_by_path(self, directory, paths):
- """Get the directory entry (either file or dir) from directory with path.
-
- Args:
- - directory: sha1 of the top level directory
- - paths: path to lookup from the top level directory. From left
- (top) to right (bottom).
-
- Returns:
- The corresponding directory entry if found, None otherwise.
-
- """
return self._directory_entry_get_by_path(directory, paths, b'')
def directory_get_random(self):
- """Finds a random directory id.
-
- Returns:
- a sha1_git if any
-
- """
if not self._directories:
return None
return random.choice(list(self._directories))
@@ -674,42 +416,6 @@
first_item['target'], paths[1:], prefix + paths[0] + b'/')
def revision_add(self, revisions):
- """Add revisions to the storage
-
- Args:
- revisions (Iterable[dict]): iterable of dictionaries representing
- the individual revisions to add. Each dict has the following
- keys:
-
- - **id** (:class:`sha1_git`): id of the revision to add
- - **date** (:class:`dict`): date the revision was written
- - **committer_date** (:class:`dict`): date the revision got
- added to the origin
- - **type** (one of 'git', 'tar'): type of the
- revision added
- - **directory** (:class:`sha1_git`): the directory the
- revision points at
- - **message** (:class:`bytes`): the message associated with
- the revision
- - **author** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
- - **committer** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
- - **metadata** (:class:`jsonb`): extra information as
- dictionary
- - **synthetic** (:class:`bool`): revision's nature (tarball,
- directory creates synthetic revision`)
- - **parents** (:class:`list[sha1_git]`): the parents of
- this revision
-
- date dictionaries have the form defined in :mod:`swh.model`.
-
- Returns:
- Summary dict of keys with associated count as values
-
- revision_added: New objects actually stored in db
-
- """
revisions = list(revisions)
if self.journal_writer:
self.journal_writer.write_additions(
@@ -734,15 +440,6 @@
return {'revision:add': count}
def revision_missing(self, revisions):
- """List revisions missing from storage
-
- Args:
- revisions (iterable): revision ids
-
- Yields:
- missing revision ids
-
- """
for id in revisions:
if id not in self._revisions:
yield id
@@ -765,68 +462,18 @@
yield from self._get_parent_revs(parent, seen, limit)
def revision_log(self, revisions, limit=None):
- """Fetch revision entry from the given root revisions.
-
- Args:
- revisions: array of root revision to lookup
- limit: limitation on the output result. Default to None.
-
- Yields:
- List of revision log from such revisions root.
-
- """
seen = set()
for rev_id in revisions:
yield from self._get_parent_revs(rev_id, seen, limit)
def revision_shortlog(self, revisions, limit=None):
- """Fetch the shortlog for the given revisions
-
- Args:
- revisions: list of root revisions to lookup
- limit: depth limitation for the output
-
- Yields:
- a list of (id, parents) tuples.
-
- """
yield from ((rev['id'], rev['parents'])
for rev in self.revision_log(revisions, limit))
def revision_get_random(self):
- """Finds a random revision id.
-
- Returns:
- a sha1_git
- """
return random.choice(list(self._revisions))
def release_add(self, releases):
- """Add releases to the storage
-
- Args:
- releases (Iterable[dict]): iterable of dictionaries representing
- the individual releases to add. Each dict has the following
- keys:
-
- - **id** (:class:`sha1_git`): id of the release to add
- - **revision** (:class:`sha1_git`): id of the revision the
- release points to
- - **date** (:class:`dict`): the date the release was made
- - **name** (:class:`bytes`): the name of the release
- - **comment** (:class:`bytes`): the comment associated with
- the release
- - **author** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
-
- the date dictionary has the form defined in :mod:`swh.model`.
-
- Returns:
- Summary dict of keys with associated count as values
-
- release:add: New objects contents actually stored in db
-
- """
releases = list(releases)
if self.journal_writer:
self.journal_writer.write_additions(
@@ -849,28 +496,9 @@
return {'release:add': count}
def release_missing(self, releases):
- """List releases missing from storage
-
- Args:
- releases: an iterable of release ids
-
- Returns:
- a list of missing release ids
-
- """
yield from (rel for rel in releases if rel not in self._releases)
def release_get(self, releases):
- """Given a list of sha1, return the releases's information
-
- Args:
- releases: list of sha1s
-
- Yields:
- dicts with the same keys as those given to `release_add`
- (or ``None`` if a release does not exist)
-
- """
for rel_id in releases:
if rel_id in self._releases:
yield self._releases[rel_id].to_dict()
@@ -878,42 +506,9 @@
yield None
def release_get_random(self):
- """Finds a random release id.
-
- Returns:
- a sha1_git
- """
return random.choice(list(self._releases))
def snapshot_add(self, snapshots):
- """Add a snapshot to the storage
-
- Args:
- snapshot ([dict]): the snapshots to add, containing the
- following keys:
-
- - **id** (:class:`bytes`): id of the snapshot
- - **branches** (:class:`dict`): branches the snapshot contains,
- mapping the branch name (:class:`bytes`) to the branch target,
- itself a :class:`dict` (or ``None`` if the branch points to an
- unknown object)
-
- - **target_type** (:class:`str`): one of ``content``,
- ``directory``, ``revision``, ``release``,
- ``snapshot``, ``alias``
- - **target** (:class:`bytes`): identifier of the target
- (currently a ``sha1_git`` for all object kinds, or the name
- of the target branch for aliases)
-
- Raises:
- ValueError: if the origin's or visit's identifier does not exist.
-
- Returns:
- Summary dict of keys with associated count as values
-
- snapshot_added: Count of object actually stored in db
-
- """
count = 0
snapshots = (Snapshot.from_dict(d) for d in snapshots)
snapshots = (snap for snap in snapshots
@@ -930,67 +525,14 @@
return {'snapshot:add': count}
def snapshot_missing(self, snapshots):
- """List snapshot missing from storage
-
- Args:
- snapshots (iterable): an iterable of snapshot ids
-
- Yields:
- missing snapshot ids
- """
for id in snapshots:
if id not in self._snapshots:
yield id
def snapshot_get(self, snapshot_id):
- """Get the content, possibly partial, of a snapshot with the given id
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
- Returns:
- dict: a dict with three keys:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
- """
return self.snapshot_get_branches(snapshot_id)
def snapshot_get_by_origin_visit(self, origin, visit):
- """Get the content, possibly partial, of a snapshot for the given origin visit
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- origin (int): the origin's identifier
- visit (int): the visit's identifier
- Returns:
- dict: None if the snapshot does not exist;
- a dict with three keys otherwise:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
-
- """
origin_url = self._get_origin_url(origin)
if not origin_url:
return
@@ -1005,33 +547,6 @@
return None
def snapshot_get_latest(self, origin, allowed_statuses=None):
- """Get the content, possibly partial, of the latest snapshot for the
- given origin, optionally only from visits that have one of the given
- allowed_statuses
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the methods :meth:`origin_visit_get_latest`
- and :meth:`snapshot_get_branches` should be used instead.
-
- Args:
- origin (str): the origin's URL
- allowed_statuses (list of str): list of visit statuses considered
- to find the latest snapshot for the origin. For instance,
- ``allowed_statuses=['full']`` will only consider visits that
- have successfully run to completion.
- Returns:
- dict: a dict with three keys:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
- """
origin_url = self._get_origin_url(origin)
if not origin_url:
return
@@ -1048,46 +563,12 @@
return snapshot
def snapshot_count_branches(self, snapshot_id):
- """Count the number of branches in the snapshot with the given id
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
-
- Returns:
- dict: A dict whose keys are the target types of branches and
- values their corresponding amount
- """
(snapshot, _) = self._snapshots[snapshot_id]
return collections.Counter(branch.target_type.value if branch else None
for branch in snapshot.branches.values())
def snapshot_get_branches(self, snapshot_id, branches_from=b'',
branches_count=1000, target_types=None):
- """Get the content, possibly partial, of a snapshot with the given id
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
- branches_from (bytes): optional parameter used to skip branches
- whose name is lesser than it before returning them
- branches_count (int): optional parameter used to restrain
- the amount of returned branches
- target_types (list): optional parameter used to filter the
- target types of branch to return (possible values that can be
- contained in that list are `'content', 'directory',
- 'revision', 'release', 'snapshot', 'alias'`)
- Returns:
- dict: None if the snapshot does not exist;
- a dict with three keys otherwise:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than
- `branches_count` branches after `branches_from` included.
- """
res = self._snapshots.get(snapshot_id)
if res is None:
return None
@@ -1126,27 +607,9 @@
}
def snapshot_get_random(self):
- """Finds a random snapshot id.
-
- Returns:
- a sha1_git
- """
return random.choice(list(self._snapshots))
def object_find_by_sha1_git(self, ids):
- """Return the objects found with the given ids.
-
- Args:
- ids: a generator of sha1_gits
-
- Returns:
- dict: a mapping from id to the list of objects found. Each object
- found is itself a dict with keys:
-
- - sha1_git: the input id
- - type: the type of object found
-
- """
ret = {}
for id_ in ids:
objs = self._objects.get(id_, [])
@@ -1163,30 +626,6 @@
return t.to_dict()
def origin_get(self, origins):
- """Return origins, either all identified by their ids or all
- identified by urls.
-
- Args:
- origin: a list of dictionaries representing the individual
- origins to find.
- These dicts have either the key url (and optionally type):
-
- - url (bytes): the url the origin points to
-
- or the id:
-
- - id (int): the origin's identifier
-
- Returns:
- dict: the origin dictionary with the keys:
-
- - id: origin's id
- - url: origin's url
-
- Raises:
- ValueError: if the keys does not match (url and type) nor id.
-
- """
if isinstance(origins, dict):
# Old API
return_single = True
@@ -1223,36 +662,12 @@
return results
def origin_get_by_sha1(self, sha1s):
- """Return origins, identified by the sha1 of their URLs.
-
- Args:
- sha1s (list[bytes]): a list of sha1s
-
- Yields:
- dicts containing origin information as returned
- by :meth:`swh.storage.in_memory.Storage.origin_get`, or None if an
- origin matching the sha1 is not found.
- """
return [
self._convert_origin(self._origins_by_sha1.get(sha1))
for sha1 in sha1s
]
def origin_get_range(self, origin_from=1, origin_count=100):
- """Retrieve ``origin_count`` origins whose ids are greater
- or equal than ``origin_from``.
-
- Origins are sorted by id before retrieving them.
-
- Args:
- origin_from (int): the minimum id of origins to retrieve
- origin_count (int): the maximum number of origins to retrieve
-
- Yields:
- dicts containing origin information as returned
- by :meth:`swh.storage.in_memory.Storage.origin_get`, plus
- an 'id' key.
- """
origin_from = max(origin_from, 1)
if origin_from <= len(self._origins_by_id):
max_idx = origin_from + origin_count - 1
@@ -1265,20 +680,6 @@
def origin_list(self, page_token: Optional[str] = None, limit: int = 100
) -> dict:
- """Returns the list of origins
-
- Args:
- page_token: opaque token used for pagination.
- limit: the maximum number of results to return
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieving the next page. if absent, there is
- no more pages to gather.
- - **origins** (List[dict]): list of origins, as returned by
- `origin_get`.
- """
origin_urls = sorted(self._origins)
if page_token:
from_ = bisect.bisect_left(origin_urls, page_token)
@@ -1297,23 +698,6 @@
def origin_search(self, url_pattern, offset=0, limit=50,
regexp=False, with_visit=False):
- """Search for origins whose urls contain a provided string pattern
- or match a provided regular expression.
- The search is performed in a case insensitive way.
-
- Args:
- url_pattern (str): the string pattern to search for in origin urls
- offset (int): number of found origins to skip before returning
- results
- limit (int): the maximum number of found origins to return
- regexp (bool): if True, consider the provided pattern as a regular
- expression and return origins whose urls match it
- with_visit (bool): if True, filter out origins with no visit
-
- Returns:
- An iterable of dict containing origin information as returned
- by :meth:`swh.storage.storage.Storage.origin_get`.
- """
origins = map(self._convert_origin, self._origins.values())
if regexp:
pat = re.compile(url_pattern)
@@ -1332,56 +716,17 @@
return origins[offset:offset+limit]
def origin_count(self, url_pattern, regexp=False, with_visit=False):
- """Count origins whose urls contain a provided string pattern
- or match a provided regular expression.
- The pattern search in origin urls is performed in a case insensitive
- way.
-
- Args:
- url_pattern (str): the string pattern to search for in origin urls
- regexp (bool): if True, consider the provided pattern as a regular
- expression and return origins whose urls match it
- with_visit (bool): if True, filter out origins with no visit
-
- Returns:
- int: The number of origins matching the search criterion.
- """
return len(self.origin_search(url_pattern, regexp=regexp,
with_visit=with_visit,
limit=len(self._origins)))
def origin_add(self, origins):
- """Add origins to the storage
-
- Args:
- origins: list of dictionaries representing the individual origins,
- with the following keys:
-
- - url (bytes): the url the origin points to
-
- Returns:
- list: given origins as dict updated with their id
-
- """
origins = copy.deepcopy(list(origins))
for origin in origins:
self.origin_add_one(origin)
return origins
def origin_add_one(self, origin):
- """Add origin to the storage
-
- Args:
- origin: dictionary representing the individual origin to add. This
- dict has the following keys:
-
- - url (bytes): the url the origin points to
-
- Returns:
- the id of the added origin, or of the identical one that already
- exists.
-
- """
origin = Origin.from_dict(origin)
if origin.url not in self._origins:
if self.journal_writer:
@@ -1401,20 +746,6 @@
return origin.url
def origin_visit_add(self, origin, date, type):
- """Add an origin_visit for the origin at date with status 'ongoing'.
-
- Args:
- origin (str): visited origin's identifier or URL
- date (Union[str,datetime]): timestamp of such visit
- type (str): the type of loader used for the visit (hg, git, ...)
-
- Returns:
- dict: dictionary with keys origin and visit where:
-
- - origin: origin's identifier
- - visit: the visit's identifier for the new visit occurrence
-
- """
origin_url = origin
if origin_url is None:
raise ValueError('Unknown origin.')
@@ -1456,20 +787,6 @@
def origin_visit_update(self, origin, visit_id, status=None,
metadata=None, snapshot=None):
- """Update an origin_visit's status.
-
- Args:
- origin (str): visited origin's URL
- visit_id (int): visit's identifier
- status: visit's new status
- metadata: data associated to the visit
- snapshot (sha1_git): identifier of the snapshot to add to
- the visit
-
- Returns:
- None
-
- """
if not isinstance(origin, str):
raise TypeError('origin must be a string, not %r' % (origin,))
origin_url = self._get_origin_url(origin)
@@ -1498,22 +815,6 @@
self._origin_visits[origin_url][visit_id-1] = visit
def origin_visit_upsert(self, visits):
- """Add a origin_visits with a specific id and with all its data.
- If there is already an origin_visit with the same
- `(origin_url, visit_id)`, updates it instead of inserting a new one.
-
- Args:
- visits: iterable of dicts with keys:
-
- - **origin**: origin url
- - **visit**: origin visit id
- - **type**: type of loader used for the visit
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot**: identifier of the snapshot to add to
- the visit
- """
for visit in visits:
if not isinstance(visit['origin'], str):
raise TypeError("visit['origin'] must be a string, not %r"
@@ -1547,19 +848,6 @@
return visit
def origin_visit_get(self, origin, last_visit=None, limit=None):
- """Retrieve all the origin's visit's information.
-
- Args:
- origin (int): the origin's identifier
- last_visit (int): visit's id from which listing the next ones,
- default to None
- limit (int): maximum number of results to return,
- default to None
-
- Yields:
- List of visits.
-
- """
origin_url = self._get_origin_url(origin)
if origin_url in self._origin_visits:
visits = self._origin_visits[origin_url]
@@ -1576,18 +864,6 @@
self._origin_visits[origin_url][visit_id-1])
def origin_visit_find_by_date(self, origin, visit_date):
- """Retrieves the origin visit whose date is closest to the provided
- timestamp.
- In case of a tie, the visit with largest id is selected.
-
- Args:
- origin (str): The occurrence's origin (URL).
- target (datetime): target timestamp
-
- Returns:
- A visit.
-
- """
origin_url = self._get_origin_url(origin)
if origin_url in self._origin_visits:
visits = self._origin_visits[origin_url]
@@ -1597,16 +873,6 @@
return self._convert_visit(visit)
def origin_visit_get_by(self, origin, visit):
- """Retrieve origin visit's information.
-
- Args:
- origin (int): the origin's identifier
-
- Returns:
- The information on that particular (origin, visit) or None if
- it does not exist
-
- """
origin_url = self._get_origin_url(origin)
if origin_url in self._origin_visits and \
visit <= len(self._origin_visits[origin_url]):
@@ -1615,31 +881,6 @@
def origin_visit_get_latest(
self, origin, allowed_statuses=None, require_snapshot=False):
- """Get the latest origin visit for the given origin, optionally
- looking only for those with one of the given allowed_statuses
- or for those with a known snapshot.
-
- Args:
- origin (str): the origin's URL
- allowed_statuses (list of str): list of visit statuses considered
- to find the latest visit. For instance,
- ``allowed_statuses=['full']`` will only consider visits that
- have successfully run to completion.
- require_snapshot (bool): If True, only a visit with a snapshot
- will be returned.
-
- Returns:
- dict: a dict with the following keys:
-
- - **origin**: the URL of the origin
- - **visit**: origin visit id
- - **type**: type of loader used for the visit
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot** (Optional[sha1_git]): identifier of the snapshot
- associated to the visit
- """
origin = self._origins.get(origin)
if not origin:
return
@@ -1656,7 +897,6 @@
return self._convert_visit(visit)
def _select_random_origin_visit_by_type(self, type: str) -> str:
- """Select randomly an origin visit """
while True:
url = random.choice(list(self._origin_visits.keys()))
random_origin_visits = self._origin_visits[url]
@@ -1664,14 +904,6 @@
return url
def origin_visit_get_random(self, type: str) -> Optional[Dict[str, Any]]:
- """Randomly select one successful origin visit with <type>
- made in the last 3 months.
-
- Returns:
- dict representing an origin visit, in the same format as
- `origin_visit_get`.
-
- """
url = self._select_random_origin_visit_by_type(type)
random_origin_visits = copy.deepcopy(self._origin_visits[url])
random_origin_visits.reverse()
@@ -1684,13 +916,6 @@
return None
def stat_counters(self):
- """compute statistics about the number of tuples in various tables
-
- Returns:
- dict: a dictionary mapping textual labels (e.g., content) to
- integer values (e.g., the number of tuples in table content)
-
- """
keys = (
'content',
'directory',
@@ -1710,20 +935,9 @@
return stats
def refresh_stat_counters(self):
- """Recomputes the statistics for `stat_counters`."""
pass
def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
-
- Args:
- origin_url (str): the origin url for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider: id of the provider of metadata (ex:'hal')
- tool: id of the tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
- """
if not isinstance(origin_url, str):
raise TypeError('origin_id must be str, not %r' % (origin_url,))
@@ -1741,25 +955,6 @@
return None
def origin_metadata_get_by(self, origin_url, provider_type=None):
- """Retrieve list of all origin_metadata entries for the origin_url
-
- Args:
- origin_url (str): the origin's url
- provider_type (str): (optional) type of provider
-
- Returns:
- list of dicts: the origin_metadata dictionary with the keys:
-
- - origin_url (int): origin's URL
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
-
- """
if not isinstance(origin_url, str):
raise TypeError('origin_url must be str, not %r' % (origin_url,))
metadata = []
@@ -1773,23 +968,6 @@
return metadata
def tool_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools (iterable of :class:`dict`): Tool information to add to
- storage. Each tool is a :class:`dict` with the following keys:
-
- - name (:class:`str`): name of the tool
- - version (:class:`str`): version of the tool
- - configuration (:class:`dict`): configuration of the tool,
- must be json-encodable
-
- Returns:
- :class:`dict`: All the tools inserted in storage
- (including the internal ``id``). The order of the list is not
- guaranteed to match the order of the initial list.
-
- """
inserted = []
for tool in tools:
key = self._tool_key(tool)
@@ -1803,32 +981,10 @@
return inserted
def tool_get(self, tool):
- """Retrieve tool information.
-
- Args:
- tool (dict): Tool information we want to retrieve from storage.
- The dicts have the same keys as those used in :func:`tool_add`.
-
- Returns:
- dict: The full tool information if it exists (``id`` included),
- None otherwise.
-
- """
return self._tools.get(self._tool_key(tool))
def metadata_provider_add(self, provider_name, provider_type, provider_url,
metadata):
- """Add a metadata provider.
-
- Args:
- provider_name (str): Its name
- provider_type (str): Its type
- provider_url (str): Its URL
- metadata: JSON-encodable object
-
- Returns:
- an identifier of the provider
- """
provider = {
'provider_name': provider_name,
'provider_type': provider_type,
@@ -1841,28 +997,9 @@
return key
def metadata_provider_get(self, provider_id):
- """Get a metadata provider
-
- Args:
- provider_id: Its identifier, as given by `metadata_provider_add`.
-
- Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
- """
return self._metadata_providers.get(provider_id)
def metadata_provider_get_by(self, provider):
- """Get a metadata provider
-
- Args:
- provider_name: Its name
- provider_url: Its URL
-
- Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
- """
key = self._metadata_provider_key(provider)
return self._metadata_providers.get(key)
@@ -1873,14 +1010,6 @@
raise TypeError('origin must be a string.')
def _person_add(self, person):
- """Add a person in storage.
-
- Note: Private method, do not use outside of this class.
-
- Args:
- person: dictionary with keys fullname, name and email.
-
- """
key = ('person', person.fullname)
if key not in self._objects:
person_id = len(self._persons) + 1
@@ -1913,3 +1042,12 @@
@staticmethod
def _metadata_provider_key(provider):
return '%r %r' % (provider['provider_name'], provider['provider_url'])
+
+ def diff_directories(self, from_dir, to_dir, track_renaming=False):
+ raise NotImplementedError('InMemoryStorage.diff_directories')
+
+ def diff_revisions(self, from_rev, to_rev, track_renaming=False):
+ raise NotImplementedError('InMemoryStorage.diff_revisions')
+
+ def diff_revision(self, revision, track_renaming=False):
+ raise NotImplementedError('InMemoryStorage.diff_revision')
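# Illustrative sketch, not part of the patch: the NotImplementedError stubs
# above keep the in-memory backend structurally complete with respect to
# StorageInterface. A hypothetical completeness check could diff the two
# method sets:
import inspect

from swh.storage.interface import StorageInterface

def missing_interface_methods(backend_class):
    """Return the interface methods the backend does not define."""
    wanted = {name for name, _ in inspect.getmembers(
                  StorageInterface, predicate=inspect.isfunction)
              if not name.startswith('_')}
    have = {name for name, _ in inspect.getmembers(
                backend_class, predicate=inspect.isfunction)}
    return wanted - have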
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/interface.py
@@ -0,0 +1,1224 @@
+# Copyright (C) 2015-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Dict, List, Optional
+
+from swh.core.api import remote_api_endpoint
+
+
+class StorageInterface:
+ @remote_api_endpoint('check_config')
+ def check_config(self, *, check_write):
+ """Check that the storage is configured and ready to go."""
+ ...
+
+ @remote_api_endpoint('content/add')
+ def content_add(self, content):
+ """Add content blobs to the storage
+
+ Args:
+            content (iterable): iterable of dictionaries representing
+ individual pieces of content to add. Each dictionary has the
+ following keys:
+
+ - data (bytes): the actual content
+ - length (int): content length (default: -1)
+ - one key for each checksum algorithm in
+ :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+ corresponding checksum
+ - status (str): one of visible, hidden, absent
+ - reason (str): if status = absent, the reason why
+ - origin (int): if status = absent, the origin we saw the
+ content in
+
+ Raises:
+
+ The following exceptions can occur:
+
+ - HashCollision in case of collision
+            - Any other exceptions raised by the db
+
+ In case of errors, some of the content may have been stored in
+ the DB and in the objstorage.
+            Since additions to both are idempotent, that should not be a problem.
+
+ Returns:
+ Summary dict with the following key and associated values:
+
+ content:add: New contents added
+            content:add:bytes: Sum of the contents' data lengths
+ skipped_content:add: New skipped contents (no data) added
+ """
+ ...
+
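# Illustrative sketch, not part of the patch: building one content dict for
# content_add with hashlib only. sha1_git hashes the git "blob" header plus
# the data, and blake2s256 is a 32-byte blake2s digest, matching how
# Software Heritage defines those checksums.
import hashlib

def content_dict(data: bytes) -> dict:
    git_header = b'blob %d\x00' % len(data)
    return {
        'data': data,
        'length': len(data),
        'status': 'visible',
        'sha1': hashlib.sha1(data).digest(),
        'sha1_git': hashlib.sha1(git_header + data).digest(),
        'sha256': hashlib.sha256(data).digest(),
        'blake2s256': hashlib.blake2s(data, digest_size=32).digest(),
    }

# usage, given `storage`, any StorageInterface implementation:
#     summary = storage.content_add([content_dict(b'hello world\n')])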
+ @remote_api_endpoint('content/update')
+ def content_update(self, content, keys=[]):
+        """Update content blobs in the storage. Does nothing for unknown
+ contents or skipped ones.
+
+ Args:
+ content (iterable): iterable of dictionaries representing
+ individual pieces of content to update. Each dictionary has the
+ following keys:
+
+ - data (bytes): the actual content
+ - length (int): content length (default: -1)
+ - one key for each checksum algorithm in
+ :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+ corresponding checksum
+ - status (str): one of visible, hidden, absent
+
+            keys (list): List of keys (str) whose values need an update, e.g.,
+ new hash column
+
+ """
+ ...
+
+ @remote_api_endpoint('content/add_metadata')
+ def content_add_metadata(self, content):
+ """Add content metadata to the storage (like `content_add`, but
+        without inserting into the objstorage).
+
+ Args:
+ content (iterable): iterable of dictionaries representing
+ individual pieces of content to add. Each dictionary has the
+ following keys:
+
+ - length (int): content length (default: -1)
+ - one key for each checksum algorithm in
+ :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+ corresponding checksum
+ - status (str): one of visible, hidden, absent
+ - reason (str): if status = absent, the reason why
+ - origin (int): if status = absent, the origin we saw the
+ content in
+ - ctime (datetime): time of insertion in the archive
+
+ Returns:
+ Summary dict with the following key and associated values:
+
+ content:add: New contents added
+ skipped_content:add: New skipped contents (no data) added
+ """
+ ...
+
+ @remote_api_endpoint('content/data')
+ def content_get(self, content):
+ """Retrieve in bulk contents and their data.
+
+        This generator yields exactly as many items as there are provided
+        sha1 identifiers, but callers should not assume this will always
+        be true.
+
+ It may also yield `None` values in case an object was not found.
+
+ Args:
+ content: iterables of sha1
+
+ Yields:
+ Dict[str, bytes]: Generates streams of contents as dict with their
+ raw data:
+
+ - sha1 (bytes): content id
+ - data (bytes): content's raw data
+
+ Raises:
+            ValueError: in case too many contents are requested;
+                cf. BULK_BLOCK_CONTENT_LEN_MAX
+
+ """
+ ...
+
+ @remote_api_endpoint('content/range')
+ def content_get_range(self, start, end, limit=1000):
+ """Retrieve contents within range [start, end] bound by limit.
+
+ Note that this function may return more than one blob per hash. The
+ limit is enforced with multiplicity (ie. two blobs with the same hash
+ will count twice toward the limit).
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **limit** (int): Limit result (default to 1000)
+
+ Returns:
+ a dict with keys:
+            - contents [dict]: iterable of contents in the range.
+            - next (bytes): if not None, contents remain in the range,
+                starting from this next sha1
+
+ """
+ ...
+
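# Illustrative sketch, not part of the patch: scanning a whole hash range by
# chaining calls on the `next` cursor described above.
def iter_content_range(storage, start=b'\x00' * 20, end=b'\xff' * 20,
                       batch_size=1000):
    """Yield every content dict in [start, end], batch_size at a time."""
    while start is not None:
        res = storage.content_get_range(start, end, limit=batch_size)
        yield from res['contents']
        start = res['next']  # None once the range is exhausted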
+ @remote_api_endpoint('content/partition')
+ def content_get_partition(
+ self, partition_id: int, nb_partitions: int, limit: int = 1000,
+ page_token: str = None):
+ """Splits contents into nb_partitions, and returns one of these based on
+ partition_id (which must be in [0, nb_partitions-1])
+
+ There is no guarantee on how the partitioning is done, or the
+ result order.
+
+ Args:
+ partition_id (int): index of the partition to fetch
+ nb_partitions (int): total number of partitions to split into
+ limit (int): Limit result (default to 1000)
+ page_token (Optional[str]): opaque token used for pagination.
+
+ Returns:
+ a dict with keys:
+ - contents (List[dict]): iterable of contents in the partition.
+ - **next_page_token** (Optional[str]): opaque token to be used as
+              `page_token` for retrieving the next page. If absent, there are
+              no more pages to gather.
+ """
+ ...
+
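# Illustrative sketch, not part of the patch: draining one partition with the
# opaque page_token, as a parallel export over nb_partitions workers would.
def iter_partition(storage, partition_id, nb_partitions, limit=1000):
    """Yield every content dict of the given partition."""
    page_token = None
    while True:
        res = storage.content_get_partition(
            partition_id, nb_partitions, limit=limit, page_token=page_token)
        yield from res['contents']
        page_token = res.get('next_page_token')
        if page_token is None:
            break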
+ @remote_api_endpoint('content/metadata')
+ def content_get_metadata(
+ self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
+ """Retrieve content metadata in bulk
+
+ Args:
+            contents: iterable of content identifiers (sha1)
+
+ Returns:
+ a dict with keys the content's sha1 and the associated value
+ either the existing content's metadata or None if the content does
+ not exist.
+
+ """
+ ...
+
+ @remote_api_endpoint('content/missing')
+ def content_missing(self, content, key_hash='sha1'):
+ """List content missing from storage
+
+ Args:
+ content ([dict]): iterable of dictionaries whose keys are
+ either 'length' or an item of
+ :data:`swh.model.hashutil.ALGORITHMS`;
+ mapped to the corresponding checksum
+ (or length).
+
+            key_hash (str): name of the hash column to use as the id in
+                the result (default: 'sha1')
+
+ Returns:
+ iterable ([bytes]): missing content ids (as per the
+ key_hash column)
+
+ Raises:
+ TODO: an exception when we get a hash collision.
+
+ """
+ ...
+
+ @remote_api_endpoint('content/missing/sha1')
+ def content_missing_per_sha1(self, contents):
+ """List content missing from storage based only on sha1.
+
+ Args:
+ contents: Iterable of sha1 to check for absence.
+
+ Returns:
+ iterable: missing ids
+
+ Raises:
+ TODO: an exception when we get a hash collision.
+
+ """
+ ...
+
+ @remote_api_endpoint('content/missing/sha1_git')
+ def content_missing_per_sha1_git(self, contents):
+ """List content missing from storage based only on sha1_git.
+
+ Args:
+ contents (Iterable): An iterable of content id (sha1_git)
+
+ Yields:
+ missing contents sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('content/skipped/missing')
+ def skipped_content_missing(self, contents):
+ """List skipped_content missing from storage
+
+ Args:
+            contents: iterable of dictionaries containing the data for each
+ checksum algorithm.
+
+ Returns:
+ iterable: missing signatures
+
+ """
+ ...
+
+ @remote_api_endpoint('content/present')
+ def content_find(self, content):
+ """Find a content hash in db.
+
+ Args:
+ content: a dictionary representing one content hash, mapping
+ checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to
+ checksum values
+
+ Returns:
+            a triplet (sha1, sha1_git, sha256) if the content exists
+ or None otherwise.
+
+ Raises:
+            ValueError: in case a key of the dictionary is neither sha1,
+                sha1_git, nor sha256.
+
+ """
+ ...
+
+ @remote_api_endpoint('content/get_random')
+ def content_get_random(self):
+ """Finds a random content id.
+
+ Returns:
+ a sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('directory/add')
+ def directory_add(self, directories):
+ """Add directories to the storage
+
+ Args:
+ directories (iterable): iterable of dictionaries representing the
+ individual directories to add. Each dict has the following
+ keys:
+
+ - id (sha1_git): the id of the directory to add
+ - entries (list): list of dicts for each entry in the
+ directory. Each dict has the following keys:
+
+ - name (bytes)
+ - type (one of 'file', 'dir', 'rev'): type of the
+ directory entry (file, directory, revision)
+ - target (sha1_git): id of the object pointed at by the
+ directory entry
+ - perms (int): entry permissions
+
+ Returns:
+ Summary dict of keys with associated count as values:
+
+ directory:add: Number of directories actually added
+
+ """
+ ...
+
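# Illustrative sketch, not part of the patch: one directory with a single
# file entry. Both sha1_git values are placeholders (in practice the
# directory id is computed from its entries); 0o100644 is the usual
# permission value for a regular file.
directory = {
    'id': b'\x01' * 20,              # sha1_git of the directory (placeholder)
    'entries': [
        {
            'name': b'README.md',
            'type': 'file',
            'target': b'\x02' * 20,  # sha1_git of the blob (placeholder)
            'perms': 0o100644,
        },
    ],
}
# usage: summary = storage.directory_add([directory])  # {'directory:add': 1}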
+ @remote_api_endpoint('directory/missing')
+ def directory_missing(self, directories):
+ """List directories missing from storage
+
+ Args:
+ directories (iterable): an iterable of directory ids
+
+ Yields:
+ missing directory ids
+
+ """
+ ...
+
+ @remote_api_endpoint('directory/ls')
+ def directory_ls(self, directory, recursive=False):
+ """Get entries for one directory.
+
+ Args:
+ - directory: the directory to list entries from.
+            - recursive: if set, list entries recursively from this directory.
+
+ Returns:
+ List of entries for such directory.
+
+ If `recursive=True`, names in the path of a dir/file not at the
+ root are concatenated with a slash (`/`).
+
+ """
+ ...
+
+ @remote_api_endpoint('directory/path')
+ def directory_entry_get_by_path(self, directory, paths):
+ """Get the directory entry (either file or dir) from directory with path.
+
+ Args:
+ - directory: sha1 of the top level directory
+            - paths: path to look up from the top level directory. From left
+ (top) to right (bottom).
+
+ Returns:
+ The corresponding directory entry if found, None otherwise.
+
+ """
+ ...
+
+ @remote_api_endpoint('directory/get_random')
+ def directory_get_random(self):
+ """Finds a random directory id.
+
+ Returns:
+ a sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('revision/add')
+ def revision_add(self, revisions):
+ """Add revisions to the storage
+
+ Args:
+ revisions (Iterable[dict]): iterable of dictionaries representing
+ the individual revisions to add. Each dict has the following
+ keys:
+
+ - **id** (:class:`sha1_git`): id of the revision to add
+ - **date** (:class:`dict`): date the revision was written
+ - **committer_date** (:class:`dict`): date the revision got
+ added to the origin
+ - **type** (one of 'git', 'tar'): type of the
+ revision added
+ - **directory** (:class:`sha1_git`): the directory the
+ revision points at
+ - **message** (:class:`bytes`): the message associated with
+ the revision
+ - **author** (:class:`Dict[str, bytes]`): dictionary with
+ keys: name, fullname, email
+ - **committer** (:class:`Dict[str, bytes]`): dictionary with
+ keys: name, fullname, email
+ - **metadata** (:class:`jsonb`): extra information as
+ dictionary
+ - **synthetic** (:class:`bool`): revision's nature (tarball,
+                directory creates synthetic revision)
+ - **parents** (:class:`list[sha1_git]`): the parents of
+ this revision
+
+ date dictionaries have the form defined in :mod:`swh.model`.
+
+ Returns:
+ Summary dict of keys with associated count as values
+
+ revision:add: New objects actually stored in db
+
+ """
+ ...
+
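# Illustrative sketch, not part of the patch: a minimal revision dict. All
# ids are placeholders, and the date dicts follow the usual swh.model shape
# (a timestamp plus a UTC offset in minutes).
author = {'name': b'Jane Doe', 'email': b'jane@example.com',
          'fullname': b'Jane Doe <jane@example.com>'}
date = {'timestamp': {'seconds': 1577836800, 'microseconds': 0},
        'offset': 0, 'negative_utc': False}
revision = {
    'id': b'\x03' * 20,           # sha1_git of the revision (placeholder)
    'date': date,
    'committer_date': date,
    'type': 'git',
    'directory': b'\x01' * 20,    # root directory id (placeholder)
    'message': b'Initial commit',
    'author': author,
    'committer': author,
    'metadata': None,
    'synthetic': False,
    'parents': [],
}
# usage: summary = storage.revision_add([revision])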
+ @remote_api_endpoint('revision/missing')
+ def revision_missing(self, revisions):
+ """List revisions missing from storage
+
+ Args:
+ revisions (iterable): revision ids
+
+ Yields:
+ missing revision ids
+
+ """
+ ...
+
+ @remote_api_endpoint('revision')
+ def revision_get(self, revisions):
+ """Get all revisions from storage
+
+ Args:
+ revisions: an iterable of revision ids
+
+ Returns:
+ iterable: an iterable of revisions as dictionaries (or None if the
+ revision doesn't exist)
+
+ """
+ ...
+
+ @remote_api_endpoint('revision/log')
+ def revision_log(self, revisions, limit=None):
+        """Fetch the revision history (log) starting from the given root
+        revisions.
+
+        Args:
+            revisions: array of root revisions to look up
+            limit: maximum number of revisions to return. Defaults to None.
+
+        Yields:
+            revision entries reachable from the given roots.
+
+ """
+ ...
+
+ @remote_api_endpoint('revision/shortlog')
+ def revision_shortlog(self, revisions, limit=None):
+ """Fetch the shortlog for the given revisions
+
+ Args:
+            revisions: list of root revisions to look up
+ limit: depth limitation for the output
+
+ Yields:
+ a list of (id, parents) tuples.
+
+ """
+ ...
+
+ @remote_api_endpoint('revision/get_random')
+ def revision_get_random(self):
+ """Finds a random revision id.
+
+ Returns:
+ a sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('release/add')
+ def release_add(self, releases):
+ """Add releases to the storage
+
+ Args:
+ releases (Iterable[dict]): iterable of dictionaries representing
+ the individual releases to add. Each dict has the following
+ keys:
+
+ - **id** (:class:`sha1_git`): id of the release to add
+ - **revision** (:class:`sha1_git`): id of the revision the
+ release points to
+ - **date** (:class:`dict`): the date the release was made
+ - **name** (:class:`bytes`): the name of the release
+ - **comment** (:class:`bytes`): the comment associated with
+ the release
+ - **author** (:class:`Dict[str, bytes]`): dictionary with
+ keys: name, fullname, email
+
+ the date dictionary has the form defined in :mod:`swh.model`.
+
+ Returns:
+ Summary dict of keys with associated count as values
+
+            release:add: New objects actually stored in db
+
+ """
+ ...
+
+ @remote_api_endpoint('release/missing')
+ def release_missing(self, releases):
+ """List releases missing from storage
+
+ Args:
+ releases: an iterable of release ids
+
+ Returns:
+ a list of missing release ids
+
+ """
+ ...
+
+ @remote_api_endpoint('release')
+ def release_get(self, releases):
+        """Given a list of sha1s, return the releases' information
+
+ Args:
+ releases: list of sha1s
+
+ Yields:
+ dicts with the same keys as those given to `release_add`
+ (or ``None`` if a release does not exist)
+
+ """
+ ...
+
+ @remote_api_endpoint('release/get_random')
+ def release_get_random(self):
+ """Finds a random release id.
+
+ Returns:
+ a sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('snapshot/add')
+ def snapshot_add(self, snapshots):
+ """Add snapshots to the storage.
+
+ Args:
+            snapshots ([dict]): the snapshots to add, each containing the
+ following keys:
+
+ - **id** (:class:`bytes`): id of the snapshot
+ - **branches** (:class:`dict`): branches the snapshot contains,
+ mapping the branch name (:class:`bytes`) to the branch target,
+ itself a :class:`dict` (or ``None`` if the branch points to an
+ unknown object)
+
+ - **target_type** (:class:`str`): one of ``content``,
+ ``directory``, ``revision``, ``release``,
+ ``snapshot``, ``alias``
+ - **target** (:class:`bytes`): identifier of the target
+ (currently a ``sha1_git`` for all object kinds, or the name
+ of the target branch for aliases)
+
+ Raises:
+ ValueError: if the origin or visit id does not exist.
+
+ Returns:
+
+ Summary dict of keys with associated count as values
+
+            snapshot:add: Count of objects actually stored in db
+
+ """
+ ...
+
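# Illustrative sketch, not part of the patch: a snapshot with one revision
# branch and a HEAD alias pointing at it. Ids are placeholders.
snapshot = {
    'id': b'\x04' * 20,                  # snapshot id (placeholder)
    'branches': {
        b'refs/heads/master': {
            'target': b'\x03' * 20,      # a revision id (placeholder)
            'target_type': 'revision',
        },
        b'HEAD': {
            'target': b'refs/heads/master',  # alias targets are branch names
            'target_type': 'alias',
        },
    },
}
# usage: summary = storage.snapshot_add([snapshot])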
+ @remote_api_endpoint('snapshot/missing')
+ def snapshot_missing(self, snapshots):
+ """List snapshots missing from storage
+
+ Args:
+ snapshots (iterable): an iterable of snapshot ids
+
+ Yields:
+ missing snapshot ids
+
+ """
+ ...
+
+ @remote_api_endpoint('snapshot')
+ def snapshot_get(self, snapshot_id):
+ """Get the content, possibly partial, of a snapshot with the given id
+
+ The branches of the snapshot are iterated in the lexicographical
+ order of their names.
+
+ .. warning:: At most 1000 branches contained in the snapshot will be
+ returned for performance reasons. In order to browse the whole
+ set of branches, the method :meth:`snapshot_get_branches`
+ should be used instead.
+
+ Args:
+ snapshot_id (bytes): identifier of the snapshot
+ Returns:
+ dict: a dict with three keys:
+ * **id**: identifier of the snapshot
+ * **branches**: a dict of branches contained in the snapshot
+ whose keys are the branches' names.
+ * **next_branch**: the name of the first branch not returned
+ or :const:`None` if the snapshot has less than 1000
+ branches.
+ """
+ ...
+
+ @remote_api_endpoint('snapshot/by_origin_visit')
+ def snapshot_get_by_origin_visit(self, origin, visit):
+ """Get the content, possibly partial, of a snapshot for the given origin visit
+
+ The branches of the snapshot are iterated in the lexicographical
+ order of their names.
+
+ .. warning:: At most 1000 branches contained in the snapshot will be
+ returned for performance reasons. In order to browse the whole
+ set of branches, the method :meth:`snapshot_get_branches`
+ should be used instead.
+
+ Args:
+ origin (int): the origin identifier
+ visit (int): the visit identifier
+ Returns:
+ dict: None if the snapshot does not exist;
+ a dict with three keys otherwise:
+ * **id**: identifier of the snapshot
+ * **branches**: a dict of branches contained in the snapshot
+ whose keys are the branches' names.
+ * **next_branch**: the name of the first branch not returned
+ or :const:`None` if the snapshot has less than 1000
+ branches.
+
+ """
+ ...
+
+ @remote_api_endpoint('snapshot/latest')
+ def snapshot_get_latest(self, origin, allowed_statuses=None):
+ """Get the content, possibly partial, of the latest snapshot for the
+ given origin, optionally only from visits that have one of the given
+ allowed_statuses
+
+ The branches of the snapshot are iterated in the lexicographical
+ order of their names.
+
+ .. warning:: At most 1000 branches contained in the snapshot will be
+ returned for performance reasons. In order to browse the whole
+ set of branches, the method :meth:`snapshot_get_branches`
+ should be used instead.
+
+ Args:
+ origin (str): the origin's URL
+ allowed_statuses (list of str): list of visit statuses considered
+                to find the latest snapshot for the origin. For instance,
+ ``allowed_statuses=['full']`` will only consider visits that
+ have successfully run to completion.
+ Returns:
+ dict: a dict with three keys:
+ * **id**: identifier of the snapshot
+ * **branches**: a dict of branches contained in the snapshot
+ whose keys are the branches' names.
+ * **next_branch**: the name of the first branch not returned
+ or :const:`None` if the snapshot has less than 1000
+ branches.
+ """
+ ...
+
+ @remote_api_endpoint('snapshot/count_branches')
+ def snapshot_count_branches(self, snapshot_id):
+ """Count the number of branches in the snapshot with the given id
+
+ Args:
+ snapshot_id (bytes): identifier of the snapshot
+
+ Returns:
+ dict: A dict whose keys are the target types of branches and
+ values their corresponding amount
+ """
+ ...
+
+ @remote_api_endpoint('snapshot/get_branches')
+ def snapshot_get_branches(self, snapshot_id, branches_from=b'',
+ branches_count=1000, target_types=None):
+ """Get the content, possibly partial, of a snapshot with the given id
+
+ The branches of the snapshot are iterated in the lexicographical
+ order of their names.
+
+ Args:
+ snapshot_id (bytes): identifier of the snapshot
+ branches_from (bytes): optional parameter used to skip branches
+ whose name is lexicographically smaller than it
+ branches_count (int): optional parameter used to limit the
+ number of returned branches
+ target_types (list): optional parameter used to filter the
+ target types of the branches to return (possible values in
+ that list are `'content'`, `'directory'`, `'revision'`,
+ `'release'`, `'snapshot'`, `'alias'`)
+ Returns:
+ dict: None if the snapshot does not exist;
+ a dict with three keys otherwise:
+ * **id**: identifier of the snapshot
+ * **branches**: a dict of branches contained in the snapshot
+ whose keys are the branches' names.
+ * **next_branch**: the name of the first branch not returned
+ or :const:`None` if the snapshot has fewer than
+ `branches_count` branches after `branches_from` (included).
+ """
+ ...
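
A pagination sketch, assuming a hypothetical `storage` object
implementing this interface; it walks every branch of a snapshot
using the `next_branch` cursor documented above:

def iter_all_branches(storage, snapshot_id, page_size=1000):
    # Yield every (name, target) pair of the snapshot, page by page.
    branches_from = b''
    while True:
        partial = storage.snapshot_get_branches(
            snapshot_id, branches_from=branches_from,
            branches_count=page_size)
        if partial is None:  # unknown snapshot
            return
        yield from partial['branches'].items()
        if partial['next_branch'] is None:
            return  # all branches have been listed
        branches_from = partial['next_branch']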
+
+ @remote_api_endpoint('snapshot/get_random')
+ def snapshot_get_random(self):
+ """Finds a random snapshot id.
+
+ Returns:
+ a sha1_git
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/add')
+ def origin_visit_add(self, origin, date, type):
+ """Add an origin_visit for the origin at ts with status 'ongoing'.
+
+ Args:
+ origin (str): visited origin's identifier or URL
+ date (Union[str,datetime]): timestamp of the visit
+ type (str): the type of loader used for the visit (hg, git, ...)
+
+ Returns:
+ dict: dictionary with keys origin and visit where:
+
+ - origin: origin identifier
+ - visit: the visit identifier for the new visit occurrence
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/update')
+ def origin_visit_update(self, origin, visit_id, status=None,
+ metadata=None, snapshot=None):
+ """Update an origin_visit's status.
+
+ Args:
+ origin (str): visited origin's URL
+ visit_id: Visit's id
+ status: Visit's new status
+ metadata: Data associated to the visit
+ snapshot (sha1_git): identifier of the snapshot to add to
+ the visit
+
+ Returns:
+ None
+
+ """
+ ...
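
A sketch of a typical loader-style lifecycle built on the two methods
above; `storage`, `url` and `snapshot_id` are assumptions made for the
example:

import datetime

# Open the visit (status 'ongoing'), then close it once loaded.
visit = storage.origin_visit_add(
    url, date=datetime.datetime.now(tz=datetime.timezone.utc),
    type='git')
# ... load contents, directories, revisions and the snapshot here ...
storage.origin_visit_update(
    url, visit['visit'], status='full', snapshot=snapshot_id)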
+
+ @remote_api_endpoint('origin/visit/upsert')
+ def origin_visit_upsert(self, visits):
+ """Add a origin_visits with a specific id and with all its data.
+ If there is already an origin_visit with the same
+ `(origin_id, visit_id)`, overwrites it.
+
+ Args:
+ visits: iterable of dicts with keys:
+
+ - **origin**: dict with keys either `id` or `url`
+ - **visit**: origin visit id
+ - **date**: timestamp of the visit
+ - **status**: Visit's new status
+ - **metadata**: Data associated to the visit
+ - **snapshot**: identifier of the snapshot to add to
+ the visit
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/get')
+ def origin_visit_get(self, origin, last_visit=None, limit=None):
+ """Retrieve all the origin's visit's information.
+
+ Args:
+ origin (str): The visited origin
+ last_visit: Starting point from which to list the next visits.
+ Defaults to None
+ limit (int): Number of results to return from the last visit.
+ Defaults to None
+
+ Yields:
+ The visits, as dicts.
+
+ """
+ ...
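
A pagination sketch over this method, assuming a hypothetical
`storage` object and origin `url`; it uses `last_visit` as a cursor:

def iter_visits(storage, url, page_size=100):
    # Yield all visits of the origin, `page_size` at a time.
    last_visit = None
    while True:
        page = list(storage.origin_visit_get(
            url, last_visit=last_visit, limit=page_size))
        yield from page
        if len(page) < page_size:
            return  # short page: no more visits
        last_visit = page[-1]['visit']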
+
+ @remote_api_endpoint('origin/visit/find_by_date')
+ def origin_visit_find_by_date(self, origin, visit_date):
+ """Retrieves the origin visit whose date is closest to the provided
+ timestamp.
+ In case of a tie, the visit with the largest id is selected.
+
+ Args:
+ origin (str): The occurrence's origin (URL).
+ visit_date (datetime): the target timestamp
+
+ Returns:
+ A visit.
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/getby')
+ def origin_visit_get_by(self, origin, visit):
+ """Retrieve origin visit's information.
+
+ Args:
+ origin: The occurrence's origin (identifier).
+ visit: The visit identifier
+
+ Returns:
+ The information on that particular (origin, visit) or None if
+ it does not exist
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/get_latest')
+ def origin_visit_get_latest(
+ self, origin, allowed_statuses=None, require_snapshot=False):
+ """Get the latest origin visit for the given origin, optionally
+ looking only for those with one of the given allowed_statuses
+ or for those with a known snapshot.
+
+ Args:
+ origin (str): the origin's URL
+ allowed_statuses (list of str): list of visit statuses considered
+ to find the latest visit. For instance,
+ ``allowed_statuses=['full']`` will only consider visits that
+ have successfully run to completion.
+ require_snapshot (bool): If True, only a visit with a snapshot
+ will be returned.
+
+ Returns:
+ dict: a dict with the following keys:
+
+ - **origin**: the URL of the origin
+ - **visit**: origin visit id
+ - **type**: type of loader used for the visit
+ - **date**: timestamp of the visit
+ - **status**: the visit's status
+ - **metadata**: Data associated to the visit
+ - **snapshot** (Optional[sha1_git]): identifier of the snapshot
+ associated to the visit
+ """
+ ...
+
+ @remote_api_endpoint('origin/visit/get_random')
+ def origin_visit_get_random(
+ self, type: str) -> Optional[Dict[str, Any]]:
+ """Randomly select one successful origin visit with <type>
+ made in the last 3 months.
+
+ Returns:
+ dict representing an origin visit, in the same format as
+ :py:meth:`origin_visit_get`.
+
+ """
+ ...
+
+ @remote_api_endpoint('object/find_by_sha1_git')
+ def object_find_by_sha1_git(self, ids):
+ """Return the objects found with the given ids.
+
+ Args:
+ ids: a generator of sha1_gits
+
+ Returns:
+ dict: a mapping from id to the list of objects found. Each object
+ found is itself a dict with keys:
+
+ - sha1_git: the input id
+ - type: the type of object found
+
+ """
+ ...
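
A small sketch, assuming a hypothetical `storage` object and a raw
20-byte identifier `some_id`, to discover what an unqualified
sha1_git points to:

result = storage.object_find_by_sha1_git([some_id])
for found in result[some_id]:
    # e.g. 'revision' or 'release'; the list is empty if nothing matches
    print(found['type'])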
+
+ @remote_api_endpoint('origin/get')
+ def origin_get(self, origins):
+ """Return origins, either all identified by their ids or all
+ identified by tuples (type, url).
+
+ If the url is given and the type is omitted, one of the origins with
+ that url is returned.
+
+ Args:
+ origins: a list of dictionaries representing the individual
+ origins to find.
+ These dicts have the key url:
+
+ - url (str): the url the origin points to
+
+ Returns:
+ dict: the origin dictionary with the keys:
+
+ - id: origin's id
+ - url: origin's url
+
+ Raises:
+ ValueError: if the url or the id does not exist.
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/get_sha1')
+ def origin_get_by_sha1(self, sha1s):
+ """Return origins, identified by the sha1 of their URLs.
+
+ Args:
+ sha1s (list[bytes]): a list of sha1s
+
+ Yields:
+ dicts containing origin information as returned
+ by :meth:`swh.storage.storage.Storage.origin_get`, or None if an
+ origin matching the sha1 is not found.
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/get_range')
+ def origin_get_range(self, origin_from=1, origin_count=100):
+ """Retrieve ``origin_count`` origins whose ids are greater
+ or equal than ``origin_from``.
+
+ Origins are sorted by id before retrieving them.
+
+ Args:
+ origin_from (int): the minimum id of origins to retrieve
+ origin_count (int): the maximum number of origins to retrieve
+
+ Yields:
+ dicts containing origin information as returned
+ by :meth:`swh.storage.storage.Storage.origin_get`.
+ """
+ ...
+
+ @remote_api_endpoint('origin/list')
+ def origin_list(
+ self, page_token: Optional[str] = None, limit: int = 100) -> dict:
+ """Returns the list of origins
+
+ Args:
+ page_token: opaque token used for pagination.
+ limit: the maximum number of results to return
+
+ Returns:
+ dict: dict with the following keys:
+ - **next_page_token** (str, optional): opaque token to be used as
+ `page_token` for retrieving the next page. If absent, there are
+ no more pages to gather.
+ - **origins** (List[dict]): list of origins, as returned by
+ `origin_get`.
+ """
+ ...
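
A sketch of the intended pagination loop, assuming a hypothetical
`storage` object; the token is opaque and must be passed back as-is:

def iter_origins(storage, page_size=100):
    # Yield every origin dict, following next_page_token to the end.
    page_token = None
    while True:
        result = storage.origin_list(page_token=page_token,
                                     limit=page_size)
        yield from result['origins']
        page_token = result.get('next_page_token')
        if page_token is None:
            return  # no more pages to gather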
+
+ @remote_api_endpoint('origin/search')
+ def origin_search(self, url_pattern, offset=0, limit=50,
+ regexp=False, with_visit=False):
+ """Search for origins whose urls contain a provided string pattern
+ or match a provided regular expression.
+ The search is performed in a case-insensitive way.
+
+ Args:
+ url_pattern (str): the string pattern to search for in origin urls
+ offset (int): number of found origins to skip before returning
+ results
+ limit (int): the maximum number of found origins to return
+ regexp (bool): if True, consider the provided pattern as a regular
+ expression and return origins whose urls match it
+ with_visit (bool): if True, filter out origins with no visit
+
+ Yields:
+ dicts containing origin information as returned
+ by :meth:`swh.storage.storage.Storage.origin_get`.
+ """
+ ...
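
Two illustrative calls, assuming a hypothetical `storage` object; the
patterns themselves are made up for the example:

# Case-insensitive substring search.
for origin in storage.origin_search('gnu/', limit=10):
    print(origin['url'])
# Regular-expression search, keeping only origins with a visit.
for origin in storage.origin_search(r'^https://example\.org/.*\.git$',
                                    regexp=True, with_visit=True):
    print(origin['url'])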
+
+ @remote_api_endpoint('origin/count')
+ def origin_count(self, url_pattern, regexp=False,
+ with_visit=False):
+ """Count origins whose urls contain a provided string pattern
+ or match a provided regular expression.
+ The pattern search in origin urls is performed in a
+ case-insensitive way.
+
+ Args:
+ url_pattern (str): the string pattern to search for in origin urls
+ regexp (bool): if True, consider the provided pattern as a regular
+ expression and return origins whose urls match it
+ with_visit (bool): if True, filter out origins with no visit
+
+ Returns:
+ int: The number of origins matching the search criterion.
+ """
+ ...
+
+ @remote_api_endpoint('origin/add_multi')
+ def origin_add(self, origins):
+ """Add origins to the storage
+
+ Args:
+ origins: list of dictionaries representing the individual origins,
+ with the following keys:
+
+ - type: the origin type ('git', 'svn', 'deb', ...)
+ - url (bytes): the url the origin points to
+
+ Returns:
+ list: given origins as dict updated with their id
+
+ """
+ ...
+
+ @remote_api_endpoint('origin/add')
+ def origin_add_one(self, origin):
+ """Add origin to the storage
+
+ Args:
+ origin: dictionary representing the individual origin to add. This
+ dict has the following keys:
+
+ - type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
+ - url (bytes): the url the origin points to
+
+ Returns:
+ the id of the added origin, or of the identical one that already
+ exists.
+
+ """
+ ...
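
A registration sketch for the two methods above, assuming a
hypothetical `storage` object; the urls are illustrative only:

storage.origin_add([
    {'type': 'git', 'url': 'https://example.org/repo-one.git'},
    {'type': 'git', 'url': 'https://example.org/repo-two.git'},
])
# Equivalent one-at-a-time variant:
storage.origin_add_one(
    {'type': 'git', 'url': 'https://example.org/repo-three.git'})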
+
+ def stat_counters(self):
+ """compute statistics about the number of tuples in various tables
+
+ Returns:
+ dict: a dictionary mapping textual labels (e.g., content) to
+ integer values (e.g., the number of tuples in table content)
+
+ """
+ ...
+
+ def refresh_stat_counters(self):
+ """Recomputes the statistics for `stat_counters`."""
+ ...
+
+ @remote_api_endpoint('origin/metadata/add')
+ def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
+ """ Add an origin_metadata for the origin at ts with provenance and
+ metadata.
+
+ Args:
+ origin_url (str): the origin url for which the metadata is added
+ ts (datetime): timestamp of the found metadata
+ provider (int): the provider of metadata (e.g. 'hal')
+ tool (int): tool used to extract metadata
+ metadata (jsonb): the metadata retrieved at the time and location
+ """
+ ...
+
+ @remote_api_endpoint('origin/metadata/get')
+ def origin_metadata_get_by(self, origin_url, provider_type=None):
+ """Retrieve list of all origin_metadata entries for the origin_id
+
+ Args:
+ origin_url (str): the origin's URL
+ provider_type (str): (optional) type of provider
+
+ Returns:
+ list of dicts: the origin_metadata dictionaries with the keys:
+
+ - origin_id (int): origin's id
+ - discovery_date (datetime): timestamp of discovery
+ - tool_id (int): metadata's extracting tool
+ - metadata (jsonb)
+ - provider_id (int): metadata's provider
+ - provider_name (str)
+ - provider_type (str)
+ - provider_url (str)
+
+ """
+ ...
+
+ @remote_api_endpoint('tool/add')
+ def tool_add(self, tools):
+ """Add new tools to the storage.
+
+ Args:
+ tools (iterable of :class:`dict`): Tool information to add to
+ storage. Each tool is a :class:`dict` with the following keys:
+
+ - name (:class:`str`): name of the tool
+ - version (:class:`str`): version of the tool
+ - configuration (:class:`dict`): configuration of the tool,
+ must be json-encodable
+
+ Returns:
+ :class:`list` of :class:`dict`: all the tools inserted in storage
+ (including the internal ``id``). The order of the list is not
+ guaranteed to match the order of the initial list.
+
+ """
+ ...
+
+ @remote_api_endpoint('tool/data')
+ def tool_get(self, tool):
+ """Retrieve tool information.
+
+ Args:
+ tool (dict): Tool information we want to retrieve from storage.
+ The dict has the same keys as those used in :func:`tool_add`.
+
+ Returns:
+ dict: The full tool information if it exists (``id`` included),
+ None otherwise.
+
+ """
+ ...
+
+ @remote_api_endpoint('provider/add')
+ def metadata_provider_add(self, provider_name, provider_type, provider_url,
+ metadata):
+ """Add a metadata provider.
+
+ Args:
+ provider_name (str): Its name
+ provider_type (str): Its type (e.g. `'deposit-client'`)
+ provider_url (str): Its URL
+ metadata: JSON-encodable object
+
+ Returns:
+ int: an identifier of the provider
+ """
+ ...
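
A sketch tying the metadata endpoints together, assuming a
hypothetical `storage` object; names and values are illustrative:

import datetime

# The provider and tool must be registered before adding metadata.
provider_id = storage.metadata_provider_add(
    'hal', 'deposit-client', 'https://hal.example.org/', {})
tool = storage.tool_add([{'name': 'swh-deposit', 'version': '0.0.1',
                          'configuration': {}}])[0]
storage.origin_metadata_add(
    'https://example.org/repo.git',
    datetime.datetime.now(tz=datetime.timezone.utc),
    provider_id, tool['id'], {'title': 'example'})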
+
+ @remote_api_endpoint('provider/get')
+ def metadata_provider_get(self, provider_id):
+ """Get a metadata provider
+
+ Args:
+ provider_id: Its identifier, as given by `metadata_provider_add`.
+
+ Returns:
+ dict: the provider information, with the same keys as the
+ arguments of `metadata_provider_add`; or None if it does
+ not exist.
+ """
+ ...
+
+ @remote_api_endpoint('provider/getby')
+ def metadata_provider_get_by(self, provider):
+ """Get a metadata provider
+
+ Args:
+ provider (dict): A dictionary with keys:
+ * provider_name: Its name
+ * provider_url: Its URL
+
+ Returns:
+ dict: the provider information, with the same keys as the
+ arguments of `metadata_provider_add`; or None if it does
+ not exist.
+ """
+ ...
+
+ @remote_api_endpoint('algos/diff_directories')
+ def diff_directories(self, from_dir, to_dir, track_renaming=False):
+ """Compute the list of file changes introduced between two arbitrary
+ directories (insertion / deletion / modification / renaming of files).
+
+ Args:
+ from_dir (bytes): identifier of the directory to compare from
+ to_dir (bytes): identifier of the directory to compare to
+ track_renaming (bool): whether or not to track file renaming
+
+ Returns:
+ A list of dict describing the introduced file changes
+ (see :func:`swh.storage.algos.diff.diff_directories`
+ for more details).
+ """
+ ...
+
+ @remote_api_endpoint('algos/diff_revisions')
+ def diff_revisions(self, from_rev, to_rev, track_renaming=False):
+ """Compute the list of file changes introduced between two arbitrary
+ revisions (insertion / deletion / modification / renaming of files).
+
+ Args:
+ from_rev (bytes): identifier of the revision to compare from
+ to_rev (bytes): identifier of the revision to compare to
+ track_renaming (bool): whether or not to track file renaming
+
+ Returns:
+ A list of dict describing the introduced file changes
+ (see :func:`swh.storage.algos.diff.diff_revisions`
+ for more details).
+ """
+ ...
+
+ @remote_api_endpoint('algos/diff_revision')
+ def diff_revision(self, revision, track_renaming=False):
+ """Compute the list of file changes introduced by a specific revision
+ (insertion / deletion / modification / renaming of files) by comparing
+ it against its first parent.
+
+ Args:
+ revision (bytes): identifier of the revision from which to
+ compute the list of files changes
+ track_renaming (bool): whether or not to track file renaming
+
+ Returns:
+ A list of dict describing the introduced file changes
+ (see :func:`swh.storage.algos.diff.diff_revision`
+ for more details).
+ """
+ ...
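
A closing sketch for the diff endpoints, assuming a hypothetical
`storage` object and a revision id `rev_id`:

# List the files touched by a revision, relative to its first parent.
for change in storage.diff_revision(rev_id, track_renaming=True):
    print(change['type'], change.get('to_path') or
          change.get('from_path'))
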
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -17,7 +17,6 @@
import psycopg2
import psycopg2.pool
-from swh.core.api import remote_api_endpoint
from swh.model.model import SHA1_SIZE
from swh.model.hashutil import ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.objstorage import get_objstorage
@@ -99,11 +98,9 @@
if db:
self.put_db(db)
- @remote_api_endpoint('check_config')
@timed
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
- """Check that the storage is configured and ready to go."""
if not self.objstorage.check_config(check_write=check_write):
return False
@@ -232,48 +229,10 @@
# move metadata in place
db.skipped_content_add_from_temp(cur)
- @remote_api_endpoint('content/add')
@timed
@process_metrics
@db_transaction()
def content_add(self, content, db=None, cur=None):
- """Add content blobs to the storage
-
- Note: in case of DB errors, objects might have already been added to
- the object storage and will not be removed. Since addition to the
- object storage is idempotent, that should not be a problem.
-
- Args:
- contents (iterable): iterable of dictionaries representing
- individual pieces of content to add. Each dictionary has the
- following keys:
-
- - data (bytes): the actual content
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
- - reason (str): if status = absent, the reason why
- - origin (int): if status = absent, the origin we saw the
- content in
-
- Raises:
-
- In case of errors, nothing is stored in the db (in the
- objstorage, it could though). The following exceptions can
- occur:
-
- - HashCollision in case of collision
- - Any other exceptions raise by the db
-
- Returns:
- Summary dict with the following key and associated values:
-
- content:add: New contents added
- content:add:bytes: Sum of the contents' length data
- skipped_content:add: New skipped contents (no data) added
- """
content = [dict(c.items()) for c in content] # semi-shallow copy
now = datetime.datetime.now(tz=datetime.timezone.utc)
for item in content:
@@ -329,35 +288,15 @@
summary['content:add:bytes'] = content_bytes_added
return summary
- @remote_api_endpoint('content/update')
@timed
@db_transaction()
def content_update(self, content, keys=[], db=None, cur=None):
- """Update content blobs to the storage. Does nothing for unknown
- contents or skipped ones.
-
- Args:
- content (iterable): iterable of dictionaries representing
- individual pieces of content to update. Each dictionary has the
- following keys:
-
- - data (bytes): the actual content
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
-
- keys (list): List of keys (str) whose values needs an update, e.g.,
- new hash column
-
- """
# TODO: Add a check on input keys. How to properly implement
# this? We don't know yet the new columns.
if self.journal_writer:
raise NotImplementedError(
- 'content_update is not yet support with a journal_writer.')
+ 'content_update is not yet supported with a journal_writer.')
db.mktemp('content', cur)
select_keys = list(set(db.content_get_metadata_keys).union(set(keys)))
@@ -365,36 +304,10 @@
db.content_update_from_temp(keys_to_update=keys,
cur=cur)
- @remote_api_endpoint('content/add_metadata')
@timed
@process_metrics
@db_transaction()
def content_add_metadata(self, content, db=None, cur=None):
- """Add content metadata to the storage (like `content_add`, but
- without inserting to the objstorage).
-
- Args:
- content (iterable): iterable of dictionaries representing
- individual pieces of content to add. Each dictionary has the
- following keys:
-
- - length (int): content length (default: -1)
- - one key for each checksum algorithm in
- :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
- corresponding checksum
- - status (str): one of visible, hidden, absent
- - reason (str): if status = absent, the reason why
- - origin (int): if status = absent, the origin we saw the
- content in
- - ctime (datetime): time of insertion in the archive
-
- Returns:
- Summary dict with the following key and associated values:
-
- content:add: New contents added
- skipped_content:add: New skipped contents (no data) added
- """
-
content = [self._normalize_content(c) for c in content]
for c in content:
self._validate_content(c)
@@ -413,31 +326,8 @@
return summary
- @remote_api_endpoint('content/data')
@timed
def content_get(self, content):
- """Retrieve in bulk contents and their data.
-
- This generator yields exactly as many items than provided sha1
- identifiers, but callers should not assume this will always be true.
-
- It may also yield `None` values in case an object was not found.
-
- Args:
- content: iterables of sha1
-
- Yields:
- Dict[str, bytes]: Generates streams of contents as dict with their
- raw data:
-
- - sha1 (bytes): content id
- - data (bytes): content's raw data
-
- Raises:
- ValueError in case of too much contents are required.
- cf. BULK_BLOCK_CONTENT_LEN_MAX
-
- """
# FIXME: Make this method support slicing the `data`.
if len(content) > BULK_BLOCK_CONTENT_LEN_MAX:
raise ValueError(
@@ -452,30 +342,9 @@
yield {'sha1': obj_id, 'data': data}
- @remote_api_endpoint('content/range')
@timed
@db_transaction()
def content_get_range(self, start, end, limit=1000, db=None, cur=None):
- """Retrieve contents within range [start, end] bound by limit.
-
- Note that this function may return more than one blob per hash. The
- limit is enforced with multiplicity (ie. two blobs with the same hash
- will count twice toward the limit).
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **limit** (int): Limit result (default to 1000)
-
- Returns:
- a dict with keys:
- - contents [dict]: iterable of contents in between the range.
- - next (bytes): There remains content in the range
- starting from this next sha1
-
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
contents = []
@@ -493,31 +362,11 @@
'next': next_content,
}
- @remote_api_endpoint('content/partition')
@timed
@db_transaction()
def content_get_partition(
self, partition_id: int, nb_partitions: int, limit: int = 1000,
page_token: str = None, db=None, cur=None):
- """Splits contents into nb_partitions, and returns one of these based on
- partition_id (which must be in [0, nb_partitions-1])
-
- There is no guarantee on how the partitioning is done, or the
- result order.
-
- Args:
- partition_id (int): index of the partition to fetch
- nb_partitions (int): total number of partitions to split into
- limit (int): Limit result (default to 1000)
- page_token (Optional[str]): opaque token used for pagination.
-
- Returns:
- a dict with keys:
- - contents (List[dict]): iterable of contents in the partition.
- - **next_page_token** (Optional[str]): opaque token to be used as
- `page_token` for retrieving the next page. if absent, there is
- no more pages to gather.
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
(start, end) = get_partition_bounds_bytes(
@@ -535,53 +384,20 @@
result2['next_page_token'] = hash_to_hex(result['next'])
return result2
- @remote_api_endpoint('content/metadata')
@timed
@db_transaction(statement_timeout=500)
def content_get_metadata(
self, contents: List[bytes],
db=None, cur=None) -> Dict[bytes, List[Dict]]:
- """Retrieve content metadata in bulk
-
- Args:
- content: iterable of content identifiers (sha1)
-
- Returns:
- a dict with keys the content's sha1 and the associated value
- either the existing content's metadata or None if the content does
- not exist.
-
- """
result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}
for row in db.content_get_metadata_from_sha1s(contents, cur):
content_meta = dict(zip(db.content_get_metadata_keys, row))
result[content_meta['sha1']].append(content_meta)
return result
- @remote_api_endpoint('content/missing')
@timed
@db_transaction_generator()
def content_missing(self, content, key_hash='sha1', db=None, cur=None):
- """List content missing from storage
-
- Args:
- content ([dict]): iterable of dictionaries whose keys are
- either 'length' or an item of
- :data:`swh.model.hashutil.ALGORITHMS`;
- mapped to the corresponding checksum
- (or length).
-
- key_hash (str): name of the column to use as hash id
- result (default: 'sha1')
-
- Returns:
- iterable ([bytes]): missing content ids (as per the
- key_hash column)
-
- Raises:
- TODO: an exception when we get a hash collision.
-
- """
keys = db.content_hash_keys
if key_hash not in keys:
@@ -595,77 +411,27 @@
for obj in db.content_missing_from_list(content, cur):
yield obj[key_hash_idx]
- @remote_api_endpoint('content/missing/sha1')
@timed
@db_transaction_generator()
def content_missing_per_sha1(self, contents, db=None, cur=None):
- """List content missing from storage based only on sha1.
-
- Args:
- contents: Iterable of sha1 to check for absence.
-
- Returns:
- iterable: missing ids
-
- Raises:
- TODO: an exception when we get a hash collision.
-
- """
for obj in db.content_missing_per_sha1(contents, cur):
yield obj[0]
- @remote_api_endpoint('content/missing/sha1_git')
@timed
@db_transaction_generator()
def content_missing_per_sha1_git(self, contents, db=None, cur=None):
- """List content missing from storage based only on sha1_git.
-
- Args:
- contents (Iterable): An iterable of content id (sha1_git)
-
- Yields:
- missing contents sha1_git
- """
for obj in db.content_missing_per_sha1_git(contents, cur):
yield obj[0]
- @remote_api_endpoint('content/skipped/missing')
@timed
@db_transaction_generator()
def skipped_content_missing(self, contents, db=None, cur=None):
- """List skipped_content missing from storage
-
- Args:
- content: iterable of dictionaries containing the data for each
- checksum algorithm.
-
- Returns:
- iterable: missing signatures
-
- """
for content in db.skipped_content_missing(contents, cur):
yield dict(zip(db.content_hash_keys, content))
- @remote_api_endpoint('content/present')
@timed
@db_transaction()
def content_find(self, content, db=None, cur=None):
- """Find a content hash in db.
-
- Args:
- content: a dictionary representing one content hash, mapping
- checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to
- checksum values
-
- Returns:
- a triplet (sha1, sha1_git, sha256) if the content exist
- or None otherwise.
-
- Raises:
- ValueError: in case the key of the dictionary is not sha1, sha1_git
- nor sha256.
-
- """
if not set(content).intersection(ALGORITHMS):
raise ValueError('content keys must contain at least one of: '
'sha1, sha1_git, sha256, blake2s256')
@@ -678,46 +444,15 @@
return [dict(zip(db.content_find_cols, content))
for content in contents]
- @remote_api_endpoint('content/get_random')
@timed
@db_transaction()
def content_get_random(self, db=None, cur=None):
- """Finds a random content id.
-
- Returns:
- a sha1_git
- """
return db.content_get_random(cur)
- @remote_api_endpoint('directory/add')
@timed
@process_metrics
@db_transaction()
def directory_add(self, directories, db=None, cur=None):
- """Add directories to the storage
-
- Args:
- directories (iterable): iterable of dictionaries representing the
- individual directories to add. Each dict has the following
- keys:
-
- - id (sha1_git): the id of the directory to add
- - entries (list): list of dicts for each entry in the
- directory. Each dict has the following keys:
-
- - name (bytes)
- - type (one of 'file', 'dir', 'rev'): type of the
- directory entry (file, directory, revision)
- - target (sha1_git): id of the object pointed at by the
- directory entry
- - perms (int): entry permissions
-
- Returns:
- Summary dict of keys with associated count as values:
-
- directory:add: Number of directories actually added
-
- """
directories = list(directories)
summary = {'directory:add': 0}
@@ -778,39 +513,15 @@
return summary
- @remote_api_endpoint('directory/missing')
@timed
@db_transaction_generator()
def directory_missing(self, directories, db=None, cur=None):
- """List directories missing from storage
-
- Args:
- directories (iterable): an iterable of directory ids
-
- Yields:
- missing directory ids
-
- """
for obj in db.directory_missing_from_list(directories, cur):
yield obj[0]
- @remote_api_endpoint('directory/ls')
@timed
@db_transaction_generator(statement_timeout=20000)
def directory_ls(self, directory, recursive=False, db=None, cur=None):
- """Get entries for one directory.
-
- Args:
- - directory: the directory to list entries from.
- - recursive: if flag on, this list recursively from this directory.
-
- Returns:
- List of entries for such directory.
-
- If `recursive=True`, names in the path of a dir/file not at the
- root are concatenated with a slash (`/`).
-
- """
if recursive:
res_gen = db.directory_walk(directory, cur=cur)
else:
@@ -819,77 +530,22 @@
for line in res_gen:
yield dict(zip(db.directory_ls_cols, line))
- @remote_api_endpoint('directory/path')
@timed
@db_transaction(statement_timeout=2000)
def directory_entry_get_by_path(self, directory, paths, db=None, cur=None):
- """Get the directory entry (either file or dir) from directory with path.
-
- Args:
- - directory: sha1 of the top level directory
- - paths: path to lookup from the top level directory. From left
- (top) to right (bottom).
-
- Returns:
- The corresponding directory entry if found, None otherwise.
-
- """
res = db.directory_entry_get_by_path(directory, paths, cur)
if res:
return dict(zip(db.directory_ls_cols, res))
- @remote_api_endpoint('directory/get_random')
@timed
@db_transaction()
def directory_get_random(self, db=None, cur=None):
- """Finds a random directory id.
-
- Returns:
- a sha1_git
- """
return db.directory_get_random(cur)
- @remote_api_endpoint('revision/add')
@timed
@process_metrics
@db_transaction()
def revision_add(self, revisions, db=None, cur=None):
- """Add revisions to the storage
-
- Args:
- revisions (Iterable[dict]): iterable of dictionaries representing
- the individual revisions to add. Each dict has the following
- keys:
-
- - **id** (:class:`sha1_git`): id of the revision to add
- - **date** (:class:`dict`): date the revision was written
- - **committer_date** (:class:`dict`): date the revision got
- added to the origin
- - **type** (one of 'git', 'tar'): type of the
- revision added
- - **directory** (:class:`sha1_git`): the directory the
- revision points at
- - **message** (:class:`bytes`): the message associated with
- the revision
- - **author** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
- - **committer** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
- - **metadata** (:class:`jsonb`): extra information as
- dictionary
- - **synthetic** (:class:`bool`): revision's nature (tarball,
- directory creates synthetic revision`)
- - **parents** (:class:`list[sha1_git]`): the parents of
- this revision
-
- date dictionaries have the form defined in :mod:`swh.model`.
-
- Returns:
- Summary dict of keys with associated count as values
-
- revision:add: New objects actually stored in db
-
- """
revisions = list(revisions)
summary = {'revision:add': 0}
@@ -925,39 +581,18 @@
return {'revision:add': len(revisions_missing)}
- @remote_api_endpoint('revision/missing')
@timed
@db_transaction_generator()
def revision_missing(self, revisions, db=None, cur=None):
- """List revisions missing from storage
-
- Args:
- revisions (iterable): revision ids
-
- Yields:
- missing revision ids
-
- """
if not revisions:
return
for obj in db.revision_missing_from_list(revisions, cur):
yield obj[0]
- @remote_api_endpoint('revision')
@timed
@db_transaction_generator(statement_timeout=1000)
def revision_get(self, revisions, db=None, cur=None):
- """Get all revisions from storage
-
- Args:
- revisions: an iterable of revision ids
-
- Returns:
- iterable: an iterable of revisions as dictionaries (or None if the
- revision doesn't exist)
-
- """
for line in db.revision_get_from_list(revisions, cur):
data = converters.db_to_revision(
dict(zip(db.revision_get_cols, line))
@@ -967,20 +602,9 @@
continue
yield data
- @remote_api_endpoint('revision/log')
@timed
@db_transaction_generator(statement_timeout=2000)
def revision_log(self, revisions, limit=None, db=None, cur=None):
- """Fetch revision entry from the given root revisions.
-
- Args:
- revisions: array of root revision to lookup
- limit: limitation on the output result. Default to None.
-
- Yields:
- List of revision log from such revisions root.
-
- """
for line in db.revision_log(revisions, limit, cur):
data = converters.db_to_revision(
dict(zip(db.revision_get_cols, line))
@@ -990,64 +614,21 @@
continue
yield data
- @remote_api_endpoint('revision/shortlog')
@timed
@db_transaction_generator(statement_timeout=2000)
def revision_shortlog(self, revisions, limit=None, db=None, cur=None):
- """Fetch the shortlog for the given revisions
-
- Args:
- revisions: list of root revisions to lookup
- limit: depth limitation for the output
-
- Yields:
- a list of (id, parents) tuples.
-
- """
yield from db.revision_shortlog(revisions, limit, cur)
- @remote_api_endpoint('revision/get_random')
@timed
@db_transaction()
def revision_get_random(self, db=None, cur=None):
- """Finds a random revision id.
-
- Returns:
- a sha1_git
- """
return db.revision_get_random(cur)
- @remote_api_endpoint('release/add')
@timed
@process_metrics
@db_transaction()
def release_add(self, releases, db=None, cur=None):
- """Add releases to the storage
-
- Args:
- releases (Iterable[dict]): iterable of dictionaries representing
- the individual releases to add. Each dict has the following
- keys:
-
- - **id** (:class:`sha1_git`): id of the release to add
- - **revision** (:class:`sha1_git`): id of the revision the
- release points to
- - **date** (:class:`dict`): the date the release was made
- - **name** (:class:`bytes`): the name of the release
- - **comment** (:class:`bytes`): the comment associated with
- the release
- - **author** (:class:`Dict[str, bytes]`): dictionary with
- keys: name, fullname, email
-
- the date dictionary has the form defined in :mod:`swh.model`.
-
- Returns:
- Summary dict of keys with associated count as values
-
- release:add: New objects contents actually stored in db
-
- """
releases = list(releases)
summary = {'release:add': 0}
@@ -1079,90 +660,33 @@
return {'release:add': len(releases_missing)}
- @remote_api_endpoint('release/missing')
@timed
@db_transaction_generator()
def release_missing(self, releases, db=None, cur=None):
- """List releases missing from storage
-
- Args:
- releases: an iterable of release ids
-
- Returns:
- a list of missing release ids
-
- """
if not releases:
return
for obj in db.release_missing_from_list(releases, cur):
yield obj[0]
- @remote_api_endpoint('release')
@timed
@db_transaction_generator(statement_timeout=500)
def release_get(self, releases, db=None, cur=None):
- """Given a list of sha1, return the releases's information
-
- Args:
- releases: list of sha1s
-
- Yields:
- dicts with the same keys as those given to `release_add`
- (or ``None`` if a release does not exist)
-
- """
for release in db.release_get_from_list(releases, cur):
data = converters.db_to_release(
dict(zip(db.release_get_cols, release))
)
yield data if data['target_type'] else None
- @remote_api_endpoint('release/get_random')
@timed
@db_transaction()
def release_get_random(self, db=None, cur=None):
- """Finds a random release id.
-
- Returns:
- a sha1_git
- """
return db.release_get_random(cur)
- @remote_api_endpoint('snapshot/add')
@timed
@process_metrics
@db_transaction()
def snapshot_add(self, snapshots, db=None, cur=None):
- """Add snapshots to the storage.
-
- Args:
- snapshot ([dict]): the snapshots to add, containing the
- following keys:
-
- - **id** (:class:`bytes`): id of the snapshot
- - **branches** (:class:`dict`): branches the snapshot contains,
- mapping the branch name (:class:`bytes`) to the branch target,
- itself a :class:`dict` (or ``None`` if the branch points to an
- unknown object)
-
- - **target_type** (:class:`str`): one of ``content``,
- ``directory``, ``revision``, ``release``,
- ``snapshot``, ``alias``
- - **target** (:class:`bytes`): identifier of the target
- (currently a ``sha1_git`` for all object kinds, or the name
- of the target branch for aliases)
-
- Raises:
- ValueError: if the origin or visit id does not exist.
-
- Returns:
-
- Summary dict of keys with associated count as values
-
- snapshot:add: Count of object actually stored in db
-
- """
created_temp_table = False
count = 0
@@ -1195,78 +719,21 @@
return {'snapshot:add': count}
- @remote_api_endpoint('snapshot/missing')
@timed
@db_transaction_generator()
def snapshot_missing(self, snapshots, db=None, cur=None):
- """List snapshots missing from storage
-
- Args:
- snapshots (iterable): an iterable of snapshot ids
-
- Yields:
- missing snapshot ids
-
- """
for obj in db.snapshot_missing_from_list(snapshots, cur):
yield obj[0]
- @remote_api_endpoint('snapshot')
@timed
@db_transaction(statement_timeout=2000)
def snapshot_get(self, snapshot_id, db=None, cur=None):
- """Get the content, possibly partial, of a snapshot with the given id
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
- Returns:
- dict: a dict with three keys:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
- """
return self.snapshot_get_branches(snapshot_id, db=db, cur=cur)
- @remote_api_endpoint('snapshot/by_origin_visit')
@timed
@db_transaction(statement_timeout=2000)
def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None):
- """Get the content, possibly partial, of a snapshot for the given origin visit
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- origin (int): the origin identifier
- visit (int): the visit identifier
- Returns:
- dict: None if the snapshot does not exist;
- a dict with three keys otherwise:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
-
- """
snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur)
if snapshot_id:
@@ -1274,38 +741,10 @@
return None
- @remote_api_endpoint('snapshot/latest')
@timed
@db_transaction(statement_timeout=4000)
def snapshot_get_latest(self, origin, allowed_statuses=None, db=None,
cur=None):
- """Get the content, possibly partial, of the latest snapshot for the
- given origin, optionally only from visits that have one of the given
- allowed_statuses
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- origin (str): the origin's URL
- allowed_statuses (list of str): list of visit statuses considered
- to find the latest snapshot for the visit. For instance,
- ``allowed_statuses=['full']`` will only consider visits that
- have successfully run to completion.
- Returns:
- dict: a dict with three keys:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
- """
if isinstance(origin, int):
origin = self.origin_get({'id': origin}, db=db, cur=cur)
if not origin:
@@ -1323,53 +762,17 @@
'last origin visit references an unknown snapshot')
return snapshot
- @remote_api_endpoint('snapshot/count_branches')
@timed
@db_transaction(statement_timeout=2000)
def snapshot_count_branches(self, snapshot_id, db=None, cur=None):
- """Count the number of branches in the snapshot with the given id
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
-
- Returns:
- dict: A dict whose keys are the target types of branches and
- values their corresponding amount
- """
return dict([bc for bc in
db.snapshot_count_branches(snapshot_id, cur)])
- @remote_api_endpoint('snapshot/get_branches')
@timed
@db_transaction(statement_timeout=2000)
def snapshot_get_branches(self, snapshot_id, branches_from=b'',
branches_count=1000, target_types=None,
db=None, cur=None):
- """Get the content, possibly partial, of a snapshot with the given id
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- Args:
- snapshot_id (bytes): identifier of the snapshot
- branches_from (bytes): optional parameter used to skip branches
- whose name is lesser than it before returning them
- branches_count (int): optional parameter used to restrain
- the amount of returned branches
- target_types (list): optional parameter used to filter the
- target types of branch to return (possible values that can be
- contained in that list are `'content', 'directory',
- 'revision', 'release', 'snapshot', 'alias'`)
- Returns:
- dict: None if the snapshot does not exist;
- a dict with three keys otherwise:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than
- `branches_count` branches after `branches_from` included.
- """
if snapshot_id == EMPTY_SNAPSHOT_ID:
return {
'id': snapshot_id,
@@ -1406,36 +809,15 @@
return None
- @remote_api_endpoint('snapshot/get_random')
@timed
@db_transaction()
def snapshot_get_random(self, db=None, cur=None):
- """Finds a random snapshot id.
-
- Returns:
- a sha1_git
- """
return db.snapshot_get_random(cur)
- @remote_api_endpoint('origin/visit/add')
@timed
@db_transaction()
def origin_visit_add(self, origin, date, type,
db=None, cur=None):
- """Add an origin_visit for the origin at ts with status 'ongoing'.
-
- Args:
- origin (str): visited origin's identifier or URL
- date (Union[str,datetime]): timestamp of such visit
- type (str): the type of loader used for the visit (hg, git, ...)
-
- Returns:
- dict: dictionary with keys origin and visit where:
-
- - origin: origin identifier
- - visit: the visit identifier for the new visit occurrence
-
- """
origin_url = origin
if isinstance(date, str):
@@ -1458,26 +840,11 @@
'visit': visit_id,
}
- @remote_api_endpoint('origin/visit/update')
@timed
@db_transaction()
def origin_visit_update(self, origin, visit_id, status=None,
metadata=None, snapshot=None,
db=None, cur=None):
- """Update an origin_visit's status.
-
- Args:
- origin (str): visited origin's URL
- visit_id: Visit's id
- status: Visit's new status
- metadata: Data associated to the visit
- snapshot (sha1_git): identifier of the snapshot to add to
- the visit
-
- Returns:
- None
-
- """
if not isinstance(origin, str):
raise TypeError('origin must be a string, not %r' % (origin,))
origin_url = origin
@@ -1503,25 +870,9 @@
db.origin_visit_update(origin_url, visit_id, updates, cur)
- @remote_api_endpoint('origin/visit/upsert')
@timed
@db_transaction()
def origin_visit_upsert(self, visits, db=None, cur=None):
- """Add a origin_visits with a specific id and with all its data.
- If there is already an origin_visit with the same
- `(origin_id, visit_id)`, overwrites it.
-
- Args:
- visits: iterable of dicts with keys:
-
- - **origin**: dict with keys either `id` or `url`
- - **visit**: origin visit id
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot**: identifier of the snapshot to add to
- the visit
- """
visits = copy.deepcopy(visits)
for visit in visits:
if isinstance(visit['date'], str):
@@ -1538,142 +889,55 @@
# TODO: upsert them all in a single query
db.origin_visit_upsert(**visit, cur=cur)
- @remote_api_endpoint('origin/visit/get')
@timed
@db_transaction_generator(statement_timeout=500)
def origin_visit_get(self, origin, last_visit=None, limit=None, db=None,
cur=None):
- """Retrieve all the origin's visit's information.
-
- Args:
- origin (str): The visited origin
- last_visit: Starting point from which listing the next visits
- Default to None
- limit (int): Number of results to return from the last visit.
- Default to None
-
- Yields:
- List of visits.
-
- """
for line in db.origin_visit_get_all(
origin, last_visit=last_visit, limit=limit, cur=cur):
data = dict(zip(db.origin_visit_get_cols, line))
yield data
- @remote_api_endpoint('origin/visit/find_by_date')
@timed
@db_transaction(statement_timeout=500)
def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None):
- """Retrieves the origin visit whose date is closest to the provided
- timestamp.
- In case of a tie, the visit with largest id is selected.
-
- Args:
- origin (str): The occurrence's origin (URL).
- target (datetime): target timestamp
-
- Returns:
- A visit.
-
- """
line = db.origin_visit_find_by_date(origin, visit_date, cur=cur)
if line:
return dict(zip(db.origin_visit_get_cols, line))
- @remote_api_endpoint('origin/visit/getby')
@timed
@db_transaction(statement_timeout=500)
def origin_visit_get_by(self, origin, visit, db=None, cur=None):
- """Retrieve origin visit's information.
-
- Args:
- origin: The occurrence's origin (identifier).
-
- Returns:
- The information on that particular (origin, visit) or None if
- it does not exist
-
- """
ori_visit = db.origin_visit_get(origin, visit, cur)
if not ori_visit:
return None
return dict(zip(db.origin_visit_get_cols, ori_visit))
- @remote_api_endpoint('origin/visit/get_latest')
@timed
@db_transaction(statement_timeout=4000)
def origin_visit_get_latest(
self, origin, allowed_statuses=None, require_snapshot=False,
db=None, cur=None):
- """Get the latest origin visit for the given origin, optionally
- looking only for those with one of the given allowed_statuses
- or for those with a known snapshot.
-
- Args:
- origin (str): the origin's URL
- allowed_statuses (list of str): list of visit statuses considered
- to find the latest visit. For instance,
- ``allowed_statuses=['full']`` will only consider visits that
- have successfully run to completion.
- require_snapshot (bool): If True, only a visit with a snapshot
- will be returned.
-
- Returns:
- dict: a dict with the following keys:
-
- - **origin**: the URL of the origin
- - **visit**: origin visit id
- - **type**: type of loader used for the visit
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot** (Optional[sha1_git]): identifier of the snapshot
- associated to the visit
- """
origin_visit = db.origin_visit_get_latest(
origin, allowed_statuses=allowed_statuses,
require_snapshot=require_snapshot, cur=cur)
if origin_visit:
return dict(zip(db.origin_visit_get_cols, origin_visit))
- @remote_api_endpoint('origin/visit/get_random')
@timed
@db_transaction()
def origin_visit_get_random(
self, type: str, db=None, cur=None) -> Optional[Dict[str, Any]]:
- """Randomly select one successful origin visit with <type>
- made in the last 3 months.
-
- Returns:
- dict representing an origin visit, in the same format as
- :py:meth:`origin_visit_get`.
-
- """
result = db.origin_visit_get_random(type, cur)
if result:
return dict(zip(db.origin_visit_get_cols, result))
else:
return None
- @remote_api_endpoint('object/find_by_sha1_git')
@timed
@db_transaction(statement_timeout=2000)
def object_find_by_sha1_git(self, ids, db=None, cur=None):
- """Return the objects found with the given ids.
-
- Args:
- ids: a generator of sha1_gits
-
- Returns:
- dict: a mapping from id to the list of objects found. Each object
- found is itself a dict with keys:
-
- - sha1_git: the input id
- - type: the type of object found
-
- """
ret = {id: [] for id in ids}
for retval in db.object_find_by_sha1_git(ids, cur=cur):
@@ -1683,33 +947,9 @@
return ret
- @remote_api_endpoint('origin/get')
@timed
@db_transaction(statement_timeout=500)
def origin_get(self, origins, db=None, cur=None):
- """Return origins, either all identified by their ids or all
- identified by tuples (type, url).
-
- If the url is given and the type is omitted, one of the origins with
- that url is returned.
-
- Args:
- origin: a list of dictionaries representing the individual
- origins to find.
- These dicts have the key url:
-
- - url (bytes): the url the origin points to
-
- Returns:
- dict: the origin dictionary with the keys:
-
- - id: origin's id
- - url: origin's url
-
- Raises:
- ValueError: if the url or the id don't exist.
-
- """
if isinstance(origins, dict):
# Old API
return_single = True
@@ -1733,67 +973,26 @@
else:
return [None if res['url'] is None else res for res in results]
- @remote_api_endpoint('origin/get_sha1')
@timed
@db_transaction_generator(statement_timeout=500)
def origin_get_by_sha1(self, sha1s, db=None, cur=None):
- """Return origins, identified by the sha1 of their URLs.
-
- Args:
- sha1s (list[bytes]): a list of sha1s
-
- Yields:
- dicts containing origin information as returned
- by :meth:`swh.storage.storage.Storage.origin_get`, or None if an
- origin matching the sha1 is not found.
-
- """
for line in db.origin_get_by_sha1(sha1s, cur):
if line[0] is not None:
yield dict(zip(db.origin_cols, line))
else:
yield None
- @remote_api_endpoint('origin/get_range')
@timed
@db_transaction_generator()
def origin_get_range(self, origin_from=1, origin_count=100,
db=None, cur=None):
- """Retrieve ``origin_count`` origins whose ids are greater
- or equal than ``origin_from``.
-
- Origins are sorted by id before retrieving them.
-
- Args:
- origin_from (int): the minimum id of origins to retrieve
- origin_count (int): the maximum number of origins to retrieve
-
- Yields:
- dicts containing origin information as returned
- by :meth:`swh.storage.storage.Storage.origin_get`.
- """
for origin in db.origin_get_range(origin_from, origin_count, cur):
yield dict(zip(db.origin_get_range_cols, origin))
- @remote_api_endpoint('origin/list')
@timed
@db_transaction()
def origin_list(self, page_token: Optional[str] = None, limit: int = 100,
*, db=None, cur=None) -> dict:
- """Returns the list of origins
-
- Args:
- page_token: opaque token used for pagination.
- limit: the maximum number of results to return
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieving the next page. if absent, there is
- no more pages to gather.
- - **origins** (List[dict]): list of origins, as returned by
- `origin_get`.
- """
page_token = page_token or '0'
if not isinstance(page_token, str):
raise TypeError('page_token must be a string.')
@@ -1814,70 +1013,23 @@
return result
- @remote_api_endpoint('origin/search')
@timed
@db_transaction_generator()
def origin_search(self, url_pattern, offset=0, limit=50,
regexp=False, with_visit=False, db=None, cur=None):
- """Search for origins whose urls contain a provided string pattern
- or match a provided regular expression.
- The search is performed in a case insensitive way.
-
- Args:
- url_pattern (str): the string pattern to search for in origin urls
- offset (int): number of found origins to skip before returning
- results
- limit (int): the maximum number of found origins to return
- regexp (bool): if True, consider the provided pattern as a regular
- expression and return origins whose urls match it
- with_visit (bool): if True, filter out origins with no visit
-
- Yields:
- dicts containing origin information as returned
- by :meth:`swh.storage.storage.Storage.origin_get`.
- """
for origin in db.origin_search(url_pattern, offset, limit,
regexp, with_visit, cur):
yield dict(zip(db.origin_cols, origin))
- @remote_api_endpoint('origin/count')
@timed
@db_transaction()
def origin_count(self, url_pattern, regexp=False,
with_visit=False, db=None, cur=None):
- """Count origins whose urls contain a provided string pattern
- or match a provided regular expression.
- The pattern search in origin urls is performed in a case insensitive
- way.
-
- Args:
- url_pattern (str): the string pattern to search for in origin urls
- regexp (bool): if True, consider the provided pattern as a regular
- expression and return origins whose urls match it
- with_visit (bool): if True, filter out origins with no visit
-
- Returns:
- int: The number of origins matching the search criterion.
- """
return db.origin_count(url_pattern, regexp, with_visit, cur)
- @remote_api_endpoint('origin/add_multi')
@timed
@db_transaction()
def origin_add(self, origins, db=None, cur=None):
- """Add origins to the storage
-
- Args:
- origins: list of dictionaries representing the individual origins,
- with the following keys:
-
- - type: the origin type ('git', 'svn', 'deb', ...)
- - url (bytes): the url the origin points to
-
- Returns:
- list: given origins as dict updated with their id
-
- """
origins = copy.deepcopy(list(origins))
for origin in origins:
self.origin_add_one(origin, db=db, cur=cur)
@@ -1885,24 +1037,9 @@
send_metric('origin:add', count=len(origins), method_name='origin_add')
return origins
- @remote_api_endpoint('origin/add')
@timed
@db_transaction()
def origin_add_one(self, origin, db=None, cur=None):
- """Add origin to the storage
-
- Args:
- origin: dictionary representing the individual origin to add. This
- dict has the following keys:
-
- - type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
- - url (bytes): the url the origin points to
-
- Returns:
- the id of the added origin, or of the identical one that already
- exists.
-
- """
origin_row = list(db.origin_get_by_url([origin['url']], cur))[0]
origin_url = dict(zip(db.origin_cols, origin_row))['url']
if origin_url:
@@ -1917,18 +1054,10 @@
@db_transaction(statement_timeout=500)
def stat_counters(self, db=None, cur=None):
- """compute statistics about the number of tuples in various tables
-
- Returns:
- dict: a dictionary mapping textual labels (e.g., content) to
- integer values (e.g., the number of tuples in table content)
-
- """
return {k: v for (k, v) in db.stat_counters()}
@db_transaction()
def refresh_stat_counters(self, db=None, cur=None):
- """Recomputes the statistics for `stat_counters`."""
keys = [
'content',
'directory',
@@ -1947,21 +1076,10 @@
for key in keys:
cur.execute('select * from swh_update_counter(%s)', (key,))
- @remote_api_endpoint('origin/metadata/add')
@timed
@db_transaction()
def origin_metadata_add(self, origin_url, ts, provider, tool, metadata,
db=None, cur=None):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
-
- Args:
- origin_url (str): the origin url for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
- """
if isinstance(ts, str):
ts = dateutil.parser.parse(ts)
@@ -1970,54 +1088,16 @@
send_metric(
'origin_metadata:add', count=1, method_name='origin_metadata_add')
- @remote_api_endpoint('origin/metadata/get')
@timed
@db_transaction_generator(statement_timeout=500)
def origin_metadata_get_by(self, origin_url, provider_type=None, db=None,
cur=None):
- """Retrieve list of all origin_metadata entries for the origin_id
-
- Args:
- origin_url (str): the origin's URL
- provider_type (str): (optional) type of provider
-
- Returns:
- list of dicts: the origin_metadata dictionary with the keys:
-
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
-
- """
for line in db.origin_metadata_get_by(origin_url, provider_type, cur):
yield dict(zip(db.origin_metadata_get_cols, line))
- @remote_api_endpoint('tool/add')
@timed
@db_transaction()
def tool_add(self, tools, db=None, cur=None):
- """Add new tools to the storage.
-
- Args:
- tools (iterable of :class:`dict`): Tool information to add to
- storage. Each tool is a :class:`dict` with the following keys:
-
- - name (:class:`str`): name of the tool
- - version (:class:`str`): version of the tool
- - configuration (:class:`dict`): configuration of the tool,
- must be json-encodable
-
- Returns:
- :class:`dict`: All the tools inserted in storage
- (including the internal ``id``). The order of the list is not
- guaranteed to match the order of the initial list.
-
- """
db.mktemp_tool(cur)
db.copy_to(tools, 'tmp_tool',
['name', 'version', 'configuration'],
@@ -2028,21 +1108,9 @@
send_metric('tool:add', count=len(results), method_name='tool_add')
return results
- @remote_api_endpoint('tool/data')
@timed
@db_transaction(statement_timeout=500)
def tool_get(self, tool, db=None, cur=None):
- """Retrieve tool information.
-
- Args:
- tool (dict): Tool information we want to retrieve from storage.
- The dicts have the same keys as those used in :func:`tool_add`.
-
- Returns:
- dict: The full tool information if it exists (``id`` included),
- None otherwise.
-
- """
tool_conf = tool['configuration']
if isinstance(tool_conf, dict):
tool_conf = json.dumps(tool_conf)
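tool_add and tool_get remain symmetric: tool_get accepts the same keys tool_add stores, and returns the stored row (id included) or None. A sketch with an illustrative tool dict:

    tool = {'name': 'some-metadata-tool',  # illustrative values
            'version': '0.0.1',
            'configuration': {'type': 'local'}}
    inserted = storage.tool_add([tool])  # list of dicts, each with an 'id'
    fetched = storage.tool_get(tool)     # the same row, or None if unknown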
@@ -2054,118 +1122,41 @@
return None
return dict(zip(db.tool_cols, idx))
- @remote_api_endpoint('provider/add')
@timed
@db_transaction()
def metadata_provider_add(self, provider_name, provider_type, provider_url,
metadata, db=None, cur=None):
- """Add a metadata provider.
-
- Args:
- provider_name (str): Its name
- provider_type (str): Its type (e.g. `'deposit-client'`)
- provider_url (str): Its URL
- metadata: JSON-encodable object
-
- Returns:
- int: an identifier of the provider
- """
result = db.metadata_provider_add(provider_name, provider_type,
provider_url, metadata, cur)
send_metric(
'metadata_provider:add', count=1, method_name='metadata_provider')
return result
- @remote_api_endpoint('provider/get')
@timed
@db_transaction()
def metadata_provider_get(self, provider_id, db=None, cur=None):
- """Get a metadata provider
-
- Args:
- provider_id: Its identifier, as given by `metadata_provider_add`.
-
- Returns:
- dict: the provider information (the fields passed to
- `metadata_provider_add`, plus the provider's `id`);
- or None if it does not exist.
- """
result = db.metadata_provider_get(provider_id)
if not result:
return None
return dict(zip(db.metadata_provider_cols, result))
- @remote_api_endpoint('provider/getby')
@timed
@db_transaction()
def metadata_provider_get_by(self, provider, db=None, cur=None):
- """Get a metadata provider
-
- Args:
- provider (dict): A dictionary with keys:
- * provider_name: Its name
- * provider_url: Its URL
-
- Returns:
- dict: the provider information (the fields passed to
- `metadata_provider_add`, plus the provider's `id`);
- or None if it does not exist.
- """
result = db.metadata_provider_get_by(provider['provider_name'],
provider['provider_url'])
if not result:
return None
return dict(zip(db.metadata_provider_cols, result))
- @remote_api_endpoint('algos/diff_directories')
@timed
def diff_directories(self, from_dir, to_dir, track_renaming=False):
- """Compute the list of file changes introduced between two arbitrary
- directories (insertion / deletion / modification / renaming of files).
-
- Args:
- from_dir (bytes): identifier of the directory to compare from
- to_dir (bytes): identifier of the directory to compare to
- track_renaming (bool): whether or not to track file renaming
-
- Returns:
- A list of dict describing the introduced file changes
- (see :func:`swh.storage.algos.diff.diff_directories`
- for more details).
- """
return diff.diff_directories(self, from_dir, to_dir, track_renaming)
- @remote_api_endpoint('algos/diff_revisions')
@timed
def diff_revisions(self, from_rev, to_rev, track_renaming=False):
- """Compute the list of file changes introduced between two arbitrary
- revisions (insertion / deletion / modification / renaming of files).
-
- Args:
- from_rev (bytes): identifier of the revision to compare from
- to_rev (bytes): identifier of the revision to compare to
- track_renaming (bool): whether or not to track file renaming
-
- Returns:
- A list of dict describing the introduced file changes
- (see :func:`swh.storage.algos.diff.diff_revisions`
- for more details).
- """
return diff.diff_revisions(self, from_rev, to_rev, track_renaming)
- @remote_api_endpoint('algos/diff_revision')
@timed
def diff_revision(self, revision, track_renaming=False):
- """Compute the list of file changes introduced by a specific revision
- (insertion / deletion / modification / renaming of files) by comparing
- it against its first parent.
-
- Args:
- revision (bytes): identifier of the revision from which to
- compute the list of files changes
- track_renaming (bool): whether or not to track file renaming
-
- Returns:
- A list of dict describing the introduced file changes
- (see :func:`swh.storage.algos.diff.diff_revision`
- for more details).
- """
return diff.diff_revision(self, revision, track_renaming)
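All three diff_* methods delegate to swh.storage.algos.diff, so their behaviour is unchanged by the docstring removal. A sketch of the single-revision variant, with a placeholder revision id:

    revision_id = b'\x00' * 20  # placeholder sha1_git
    # Compare the revision against its first parent; each element of the
    # result is a dict describing one file insertion / deletion /
    # modification / renaming (see swh.storage.algos.diff for the keys).
    changes = storage.diff_revision(revision_id, track_renaming=True)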
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -6,6 +6,7 @@
import copy
from contextlib import contextmanager
import datetime
+import inspect
import itertools
import math
import queue
@@ -28,6 +29,7 @@
from swh.model.hypothesis_strategies import objects
from swh.storage import HashCollision
from swh.storage.converters import origin_url_to_sha1 as sha1
+from swh.storage.interface import StorageInterface
from .storage_data import data
@@ -95,6 +97,34 @@
"""
maxDiff = None # type: ClassVar[Optional[int]]
+ def test_types(self, swh_storage):
+ """Checks all methods of StorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # The protocol cannot be instantiated directly, so create a trivial
+ # subclass and instantiate that instead.
+ interface = type('_', (StorageInterface,), {})()
+
+ assert 'content_add' in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith('_'):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(swh_storage, meth_name)
+ except AttributeError:
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
+
def test_check_config(self, swh_storage):
assert swh_storage.check_config(check_write=True)
assert swh_storage.check_config(check_write=False)
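The signature check in test_types is self-contained enough to illustrate on its own. A minimal standalone sketch of the same technique, with made-up Interface/Impl classes standing in for StorageInterface and a backend:

    import inspect

    class Interface:
        def content_add(self, content):
            ...

    class Impl:
        def content_add(self, content):
            return {'content:add': 0}

    # Compare the signatures of all public methods, as test_types does.
    for name in dir(Interface):
        if name.startswith('_'):
            continue
        expected = inspect.signature(getattr(Interface, name))
        actual = inspect.signature(getattr(Impl, name))
        assert expected == actual, name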
