Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/storage.py
Show All 11 Lines | |||||
from concurrent.futures import ThreadPoolExecutor | from concurrent.futures import ThreadPoolExecutor | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from typing import Any, Dict, Mapping | from typing import Any, Dict, Mapping | ||||
import dateutil.parser | import dateutil.parser | ||||
import psycopg2 | import psycopg2 | ||||
import psycopg2.pool | import psycopg2.pool | ||||
from . import converters | from swh.core.api import remote_api_endpoint | ||||
from .common import db_transaction_generator, db_transaction | |||||
from .db import Db | |||||
from .exc import StorageDBError | |||||
from .algos import diff | |||||
from swh.model.hashutil import ALGORITHMS, hash_to_bytes | from swh.model.hashutil import ALGORITHMS, hash_to_bytes | ||||
from swh.objstorage import get_objstorage | from swh.objstorage import get_objstorage | ||||
from swh.objstorage.exc import ObjNotFoundError | from swh.objstorage.exc import ObjNotFoundError | ||||
try: | try: | ||||
from swh.journal.writer import get_journal_writer | from swh.journal.writer import get_journal_writer | ||||
except ImportError: | except ImportError: | ||||
get_journal_writer = None # type: ignore | get_journal_writer = None # type: ignore | ||||
# mypy limitation, see https://github.com/python/mypy/issues/1153 | # mypy limitation, see https://github.com/python/mypy/issues/1153 | ||||
from . import converters | |||||
from .common import db_transaction_generator, db_transaction | |||||
from .db import Db | |||||
from .exc import StorageDBError | |||||
from .algos import diff | |||||
# Max block size of contents to return | # Max block size of contents to return | ||||
BULK_BLOCK_CONTENT_LEN_MAX = 10000 | BULK_BLOCK_CONTENT_LEN_MAX = 10000 | ||||
EMPTY_SNAPSHOT_ID = hash_to_bytes('1a8893e6a86f444e8be8e7bda6cb34fb1735a00e') | EMPTY_SNAPSHOT_ID = hash_to_bytes('1a8893e6a86f444e8be8e7bda6cb34fb1735a00e') | ||||
"""Identifier for the empty snapshot""" | """Identifier for the empty snapshot""" | ||||
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines | def db(self): | ||||
db = None | db = None | ||||
try: | try: | ||||
db = self.get_db() | db = self.get_db() | ||||
yield db | yield db | ||||
finally: | finally: | ||||
if db: | if db: | ||||
self.put_db(db) | self.put_db(db) | ||||
@remote_api_endpoint('check_config') | |||||
@db_transaction() | @db_transaction() | ||||
def check_config(self, *, check_write, db, cur): | def check_config(self, *, check_write, db=None, cur=None): | ||||
"""Check that the storage is configured and ready to go.""" | """Check that the storage is configured and ready to go.""" | ||||
if not self.objstorage.check_config(check_write=check_write): | if not self.objstorage.check_config(check_write=check_write): | ||||
return False | return False | ||||
# Check permissions on one of the tables | # Check permissions on one of the tables | ||||
if check_write: | if check_write: | ||||
check = 'INSERT' | check = 'INSERT' | ||||
Show All 37 Lines | def _validate_content(d): | ||||
if d['status'] != 'absent' and d.get('reason') is not None: | if d['status'] != 'absent' and d.get('reason') is not None: | ||||
raise ValueError( | raise ValueError( | ||||
'Must not provide a reason if content is not absent.') | 'Must not provide a reason if content is not absent.') | ||||
if d['length'] < -1: | if d['length'] < -1: | ||||
raise ValueError('Content length must be positive or -1.') | raise ValueError('Content length must be positive or -1.') | ||||
def _filter_new_content(self, content, db, cur): | def _filter_new_content(self, content, db=None, cur=None): | ||||
"""Sort contents into buckets 'with data' and 'without data', | """Sort contents into buckets 'with data' and 'without data', | ||||
and filter out those already in the database.""" | and filter out those already in the database.""" | ||||
content_by_status = defaultdict(list) | content_by_status = defaultdict(list) | ||||
for d in content: | for d in content: | ||||
content_by_status[d['status']].append(d) | content_by_status[d['status']].append(d) | ||||
content_with_data = content_by_status['visible'] \ | content_with_data = content_by_status['visible'] \ | ||||
+ content_by_status['hidden'] | + content_by_status['hidden'] | ||||
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | def _content_add_metadata(self, db, cur, | ||||
cont['origin'] = origin_id | cont['origin'] = origin_id | ||||
db.mktemp('skipped_content', cur) | db.mktemp('skipped_content', cur) | ||||
db.copy_to(content_without_data, 'tmp_skipped_content', | db.copy_to(content_without_data, 'tmp_skipped_content', | ||||
db.skipped_content_keys, cur) | db.skipped_content_keys, cur) | ||||
# move metadata in place | # move metadata in place | ||||
db.skipped_content_add_from_temp(cur) | db.skipped_content_add_from_temp(cur) | ||||
@remote_api_endpoint('content/add') | |||||
@db_transaction() | @db_transaction() | ||||
def content_add(self, content, db, cur): | def content_add(self, content, db=None, cur=None): | ||||
"""Add content blobs to the storage | """Add content blobs to the storage | ||||
Note: in case of DB errors, objects might have already been added to | Note: in case of DB errors, objects might have already been added to | ||||
the object storage and will not be removed. Since addition to the | the object storage and will not be removed. Since addition to the | ||||
object storage is idempotent, that should not be a problem. | object storage is idempotent, that should not be a problem. | ||||
Args: | Args: | ||||
contents (iterable): iterable of dictionaries representing | contents (iterable): iterable of dictionaries representing | ||||
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines | def content_add(self, content, db=None, cur=None): | ||||
# Wait for objstorage addition before returning from the | # Wait for objstorage addition before returning from the | ||||
# transaction, bubbling up any exception | # transaction, bubbling up any exception | ||||
content_bytes_added = added_to_objstorage.result() | content_bytes_added = added_to_objstorage.result() | ||||
summary['content:add:bytes'] = content_bytes_added | summary['content:add:bytes'] = content_bytes_added | ||||
return summary | return summary | ||||
@remote_api_endpoint('content/update') | |||||
@db_transaction() | @db_transaction() | ||||
def content_update(self, content, keys=[], db=None, cur=None): | def content_update(self, content, keys=[], db=None, cur=None): | ||||
"""Update content blobs to the storage. Does nothing for unknown | """Update content blobs to the storage. Does nothing for unknown | ||||
contents or skipped ones. | contents or skipped ones. | ||||
Args: | Args: | ||||
content (iterable): iterable of dictionaries representing | content (iterable): iterable of dictionaries representing | ||||
individual pieces of content to update. Each dictionary has the | individual pieces of content to update. Each dictionary has the | ||||
Show All 18 Lines | def content_update(self, content, keys=[], db=None, cur=None): | ||||
'content_update is not yet support with a journal_writer.') | 'content_update is not yet support with a journal_writer.') | ||||
db.mktemp('content', cur) | db.mktemp('content', cur) | ||||
select_keys = list(set(db.content_get_metadata_keys).union(set(keys))) | select_keys = list(set(db.content_get_metadata_keys).union(set(keys))) | ||||
db.copy_to(content, 'tmp_content', select_keys, cur) | db.copy_to(content, 'tmp_content', select_keys, cur) | ||||
db.content_update_from_temp(keys_to_update=keys, | db.content_update_from_temp(keys_to_update=keys, | ||||
cur=cur) | cur=cur) | ||||
@remote_api_endpoint('content/add_metadata') | |||||
@db_transaction() | @db_transaction() | ||||
def content_add_metadata(self, content, db, cur): | def content_add_metadata(self, content, db=None, cur=None): | ||||
"""Add content metadata to the storage (like `content_add`, but | """Add content metadata to the storage (like `content_add`, but | ||||
without inserting to the objstorage). | without inserting to the objstorage). | ||||
Args: | Args: | ||||
content (iterable): iterable of dictionaries representing | content (iterable): iterable of dictionaries representing | ||||
individual pieces of content to add. Each dictionary has the | individual pieces of content to add. Each dictionary has the | ||||
following keys: | following keys: | ||||
Show All 27 Lines | def content_add_metadata(self, content, db=None, cur=None): | ||||
assert 'data' not in content | assert 'data' not in content | ||||
self.journal_writer.write_addition('content', item) | self.journal_writer.write_addition('content', item) | ||||
self._content_add_metadata( | self._content_add_metadata( | ||||
db, cur, content_with_data, content_without_data) | db, cur, content_with_data, content_without_data) | ||||
return summary | return summary | ||||
@remote_api_endpoint('content/data') | |||||
def content_get(self, content): | def content_get(self, content): | ||||
"""Retrieve in bulk contents and their data. | """Retrieve in bulk contents and their data. | ||||
This generator yields exactly as many items than provided sha1 | This generator yields exactly as many items than provided sha1 | ||||
identifiers, but callers should not assume this will always be true. | identifiers, but callers should not assume this will always be true. | ||||
It may also yield `None` values in case an object was not found. | It may also yield `None` values in case an object was not found. | ||||
Show All 21 Lines | def content_get(self, content): | ||||
try: | try: | ||||
data = self.objstorage.get(obj_id) | data = self.objstorage.get(obj_id) | ||||
except ObjNotFoundError: | except ObjNotFoundError: | ||||
yield None | yield None | ||||
continue | continue | ||||
yield {'sha1': obj_id, 'data': data} | yield {'sha1': obj_id, 'data': data} | ||||
@remote_api_endpoint('content/range') | |||||
@db_transaction() | @db_transaction() | ||||
def content_get_range(self, start, end, limit=1000, db=None, cur=None): | def content_get_range(self, start, end, limit=1000, db=None, cur=None): | ||||
"""Retrieve contents within range [start, end] bound by limit. | """Retrieve contents within range [start, end] bound by limit. | ||||
Note that this function may return more than one blob per hash. The | Note that this function may return more than one blob per hash. The | ||||
limit is enforced with multiplicity (ie. two blobs with the same hash | limit is enforced with multiplicity (ie. two blobs with the same hash | ||||
will count twice toward the limit). | will count twice toward the limit). | ||||
Show All 23 Lines | def content_get_range(self, start, end, limit=1000, db=None, cur=None): | ||||
next_content = content['sha1'] | next_content = content['sha1'] | ||||
break | break | ||||
contents.append(content) | contents.append(content) | ||||
return { | return { | ||||
'contents': contents, | 'contents': contents, | ||||
'next': next_content, | 'next': next_content, | ||||
} | } | ||||
@remote_api_endpoint('content/metadata') | |||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def content_get_metadata(self, content, db=None, cur=None): | def content_get_metadata(self, content, db=None, cur=None): | ||||
"""Retrieve content metadata in bulk | """Retrieve content metadata in bulk | ||||
Args: | Args: | ||||
content: iterable of content identifiers (sha1) | content: iterable of content identifiers (sha1) | ||||
Returns: | Returns: | ||||
an iterable with content metadata corresponding to the given ids | an iterable with content metadata corresponding to the given ids | ||||
""" | """ | ||||
for metadata in db.content_get_metadata_from_sha1s(content, cur): | for metadata in db.content_get_metadata_from_sha1s(content, cur): | ||||
yield dict(zip(db.content_get_metadata_keys, metadata)) | yield dict(zip(db.content_get_metadata_keys, metadata)) | ||||
@remote_api_endpoint('content/missing') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def content_missing(self, content, key_hash='sha1', db=None, cur=None): | def content_missing(self, content, key_hash='sha1', db=None, cur=None): | ||||
"""List content missing from storage | """List content missing from storage | ||||
Args: | Args: | ||||
content ([dict]): iterable of dictionaries whose keys are | content ([dict]): iterable of dictionaries whose keys are | ||||
either 'length' or an item of | either 'length' or an item of | ||||
:data:`swh.model.hashutil.ALGORITHMS`; | :data:`swh.model.hashutil.ALGORITHMS`; | ||||
Show All 19 Lines | def content_missing(self, content, key_hash='sha1', db=None, cur=None): | ||||
key_hash_idx = keys.index(key_hash) | key_hash_idx = keys.index(key_hash) | ||||
if not content: | if not content: | ||||
return | return | ||||
for obj in db.content_missing_from_list(content, cur): | for obj in db.content_missing_from_list(content, cur): | ||||
yield obj[key_hash_idx] | yield obj[key_hash_idx] | ||||
@remote_api_endpoint('content/missing/sha1') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def content_missing_per_sha1(self, contents, db=None, cur=None): | def content_missing_per_sha1(self, contents, db=None, cur=None): | ||||
"""List content missing from storage based only on sha1. | """List content missing from storage based only on sha1. | ||||
Args: | Args: | ||||
contents: Iterable of sha1 to check for absence. | contents: Iterable of sha1 to check for absence. | ||||
Returns: | Returns: | ||||
iterable: missing ids | iterable: missing ids | ||||
Raises: | Raises: | ||||
TODO: an exception when we get a hash collision. | TODO: an exception when we get a hash collision. | ||||
""" | """ | ||||
for obj in db.content_missing_per_sha1(contents, cur): | for obj in db.content_missing_per_sha1(contents, cur): | ||||
yield obj[0] | yield obj[0] | ||||
@remote_api_endpoint('content/skipped/missing') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def skipped_content_missing(self, contents, db=None, cur=None): | def skipped_content_missing(self, contents, db=None, cur=None): | ||||
"""List skipped_content missing from storage | """List skipped_content missing from storage | ||||
Args: | Args: | ||||
content: iterable of dictionaries containing the data for each | content: iterable of dictionaries containing the data for each | ||||
checksum algorithm. | checksum algorithm. | ||||
Returns: | Returns: | ||||
iterable: missing signatures | iterable: missing signatures | ||||
""" | """ | ||||
for content in db.skipped_content_missing(contents, cur): | for content in db.skipped_content_missing(contents, cur): | ||||
yield dict(zip(db.content_hash_keys, content)) | yield dict(zip(db.content_hash_keys, content)) | ||||
@remote_api_endpoint('content/present') | |||||
@db_transaction() | @db_transaction() | ||||
def content_find(self, content, db=None, cur=None): | def content_find(self, content, db=None, cur=None): | ||||
"""Find a content hash in db. | """Find a content hash in db. | ||||
Args: | Args: | ||||
content: a dictionary representing one content hash, mapping | content: a dictionary representing one content hash, mapping | ||||
checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to | checksum algorithm names (see swh.model.hashutil.ALGORITHMS) to | ||||
checksum values | checksum values | ||||
Show All 14 Lines | def content_find(self, content, db=None, cur=None): | ||||
contents = db.content_find(sha1=content.get('sha1'), | contents = db.content_find(sha1=content.get('sha1'), | ||||
sha1_git=content.get('sha1_git'), | sha1_git=content.get('sha1_git'), | ||||
sha256=content.get('sha256'), | sha256=content.get('sha256'), | ||||
blake2s256=content.get('blake2s256'), | blake2s256=content.get('blake2s256'), | ||||
cur=cur) | cur=cur) | ||||
return [dict(zip(db.content_find_cols, content)) | return [dict(zip(db.content_find_cols, content)) | ||||
for content in contents] | for content in contents] | ||||
@remote_api_endpoint('directory/add') | |||||
@db_transaction() | @db_transaction() | ||||
def directory_add(self, directories, db, cur): | def directory_add(self, directories, db=None, cur=None): | ||||
"""Add directories to the storage | """Add directories to the storage | ||||
Args: | Args: | ||||
directories (iterable): iterable of dictionaries representing the | directories (iterable): iterable of dictionaries representing the | ||||
individual directories to add. Each dict has the following | individual directories to add. Each dict has the following | ||||
keys: | keys: | ||||
- id (sha1_git): the id of the directory to add | - id (sha1_git): the id of the directory to add | ||||
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | def directory_add(self, directories, db=None, cur=None): | ||||
) | ) | ||||
# Do the final copy | # Do the final copy | ||||
db.directory_add_from_temp(cur) | db.directory_add_from_temp(cur) | ||||
summary['directory:add'] = len(dirs_missing) | summary['directory:add'] = len(dirs_missing) | ||||
return summary | return summary | ||||
@remote_api_endpoint('directory/missing') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def directory_missing(self, directories, db=None, cur=None): | def directory_missing(self, directories, db=None, cur=None): | ||||
"""List directories missing from storage | """List directories missing from storage | ||||
Args: | Args: | ||||
directories (iterable): an iterable of directory ids | directories (iterable): an iterable of directory ids | ||||
Yields: | Yields: | ||||
missing directory ids | missing directory ids | ||||
""" | """ | ||||
for obj in db.directory_missing_from_list(directories, cur): | for obj in db.directory_missing_from_list(directories, cur): | ||||
yield obj[0] | yield obj[0] | ||||
@remote_api_endpoint('directory/ls') | |||||
@db_transaction_generator(statement_timeout=20000) | @db_transaction_generator(statement_timeout=20000) | ||||
def directory_ls(self, directory, recursive=False, db=None, cur=None): | def directory_ls(self, directory, recursive=False, db=None, cur=None): | ||||
"""Get entries for one directory. | """Get entries for one directory. | ||||
Args: | Args: | ||||
- directory: the directory to list entries from. | - directory: the directory to list entries from. | ||||
- recursive: if flag on, this list recursively from this directory. | - recursive: if flag on, this list recursively from this directory. | ||||
Returns: | Returns: | ||||
List of entries for such directory. | List of entries for such directory. | ||||
If `recursive=True`, names in the path of a dir/file not at the | If `recursive=True`, names in the path of a dir/file not at the | ||||
root are concatenated with a slash (`/`). | root are concatenated with a slash (`/`). | ||||
""" | """ | ||||
if recursive: | if recursive: | ||||
res_gen = db.directory_walk(directory, cur=cur) | res_gen = db.directory_walk(directory, cur=cur) | ||||
else: | else: | ||||
res_gen = db.directory_walk_one(directory, cur=cur) | res_gen = db.directory_walk_one(directory, cur=cur) | ||||
for line in res_gen: | for line in res_gen: | ||||
yield dict(zip(db.directory_ls_cols, line)) | yield dict(zip(db.directory_ls_cols, line)) | ||||
@remote_api_endpoint('directory/path') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def directory_entry_get_by_path(self, directory, paths, db=None, cur=None): | def directory_entry_get_by_path(self, directory, paths, db=None, cur=None): | ||||
"""Get the directory entry (either file or dir) from directory with path. | """Get the directory entry (either file or dir) from directory with path. | ||||
Args: | Args: | ||||
- directory: sha1 of the top level directory | - directory: sha1 of the top level directory | ||||
- paths: path to lookup from the top level directory. From left | - paths: path to lookup from the top level directory. From left | ||||
(top) to right (bottom). | (top) to right (bottom). | ||||
Returns: | Returns: | ||||
The corresponding directory entry if found, None otherwise. | The corresponding directory entry if found, None otherwise. | ||||
""" | """ | ||||
res = db.directory_entry_get_by_path(directory, paths, cur) | res = db.directory_entry_get_by_path(directory, paths, cur) | ||||
if res: | if res: | ||||
return dict(zip(db.directory_ls_cols, res)) | return dict(zip(db.directory_ls_cols, res)) | ||||
@remote_api_endpoint('revision/add') | |||||
@db_transaction() | @db_transaction() | ||||
def revision_add(self, revisions, db, cur): | def revision_add(self, revisions, db=None, cur=None): | ||||
"""Add revisions to the storage | """Add revisions to the storage | ||||
Args: | Args: | ||||
revisions (Iterable[dict]): iterable of dictionaries representing | revisions (Iterable[dict]): iterable of dictionaries representing | ||||
the individual revisions to add. Each dict has the following | the individual revisions to add. Each dict has the following | ||||
keys: | keys: | ||||
- **id** (:class:`sha1_git`): id of the revision to add | - **id** (:class:`sha1_git`): id of the revision to add | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | def revision_add(self, revisions, db=None, cur=None): | ||||
db.revision_add_from_temp(cur) | db.revision_add_from_temp(cur) | ||||
db.copy_to(parents_filtered, 'revision_history', | db.copy_to(parents_filtered, 'revision_history', | ||||
['id', 'parent_id', 'parent_rank'], cur) | ['id', 'parent_id', 'parent_rank'], cur) | ||||
return {'revision:add': len(revisions_missing)} | return {'revision:add': len(revisions_missing)} | ||||
@remote_api_endpoint('revision/missing') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def revision_missing(self, revisions, db=None, cur=None): | def revision_missing(self, revisions, db=None, cur=None): | ||||
"""List revisions missing from storage | """List revisions missing from storage | ||||
Args: | Args: | ||||
revisions (iterable): revision ids | revisions (iterable): revision ids | ||||
Yields: | Yields: | ||||
missing revision ids | missing revision ids | ||||
""" | """ | ||||
if not revisions: | if not revisions: | ||||
return | return | ||||
for obj in db.revision_missing_from_list(revisions, cur): | for obj in db.revision_missing_from_list(revisions, cur): | ||||
yield obj[0] | yield obj[0] | ||||
@remote_api_endpoint('revision') | |||||
@db_transaction_generator(statement_timeout=1000) | @db_transaction_generator(statement_timeout=1000) | ||||
def revision_get(self, revisions, db=None, cur=None): | def revision_get(self, revisions, db=None, cur=None): | ||||
"""Get all revisions from storage | """Get all revisions from storage | ||||
Args: | Args: | ||||
revisions: an iterable of revision ids | revisions: an iterable of revision ids | ||||
Returns: | Returns: | ||||
iterable: an iterable of revisions as dictionaries (or None if the | iterable: an iterable of revisions as dictionaries (or None if the | ||||
revision doesn't exist) | revision doesn't exist) | ||||
""" | """ | ||||
for line in db.revision_get_from_list(revisions, cur): | for line in db.revision_get_from_list(revisions, cur): | ||||
data = converters.db_to_revision( | data = converters.db_to_revision( | ||||
dict(zip(db.revision_get_cols, line)) | dict(zip(db.revision_get_cols, line)) | ||||
) | ) | ||||
if not data['type']: | if not data['type']: | ||||
yield None | yield None | ||||
continue | continue | ||||
yield data | yield data | ||||
@remote_api_endpoint('revision/log') | |||||
@db_transaction_generator(statement_timeout=2000) | @db_transaction_generator(statement_timeout=2000) | ||||
def revision_log(self, revisions, limit=None, db=None, cur=None): | def revision_log(self, revisions, limit=None, db=None, cur=None): | ||||
"""Fetch revision entry from the given root revisions. | """Fetch revision entry from the given root revisions. | ||||
Args: | Args: | ||||
revisions: array of root revision to lookup | revisions: array of root revision to lookup | ||||
limit: limitation on the output result. Default to None. | limit: limitation on the output result. Default to None. | ||||
Yields: | Yields: | ||||
List of revision log from such revisions root. | List of revision log from such revisions root. | ||||
""" | """ | ||||
for line in db.revision_log(revisions, limit, cur): | for line in db.revision_log(revisions, limit, cur): | ||||
data = converters.db_to_revision( | data = converters.db_to_revision( | ||||
dict(zip(db.revision_get_cols, line)) | dict(zip(db.revision_get_cols, line)) | ||||
) | ) | ||||
if not data['type']: | if not data['type']: | ||||
yield None | yield None | ||||
continue | continue | ||||
yield data | yield data | ||||
@remote_api_endpoint('revision/shortlog') | |||||
@db_transaction_generator(statement_timeout=2000) | @db_transaction_generator(statement_timeout=2000) | ||||
def revision_shortlog(self, revisions, limit=None, db=None, cur=None): | def revision_shortlog(self, revisions, limit=None, db=None, cur=None): | ||||
"""Fetch the shortlog for the given revisions | """Fetch the shortlog for the given revisions | ||||
Args: | Args: | ||||
revisions: list of root revisions to lookup | revisions: list of root revisions to lookup | ||||
limit: depth limitation for the output | limit: depth limitation for the output | ||||
Yields: | Yields: | ||||
a list of (id, parents) tuples. | a list of (id, parents) tuples. | ||||
""" | """ | ||||
yield from db.revision_shortlog(revisions, limit, cur) | yield from db.revision_shortlog(revisions, limit, cur) | ||||
@remote_api_endpoint('release/add') | |||||
@db_transaction() | @db_transaction() | ||||
def release_add(self, releases, db, cur): | def release_add(self, releases, db=None, cur=None): | ||||
"""Add releases to the storage | """Add releases to the storage | ||||
Args: | Args: | ||||
releases (Iterable[dict]): iterable of dictionaries representing | releases (Iterable[dict]): iterable of dictionaries representing | ||||
the individual releases to add. Each dict has the following | the individual releases to add. Each dict has the following | ||||
keys: | keys: | ||||
- **id** (:class:`sha1_git`): id of the release to add | - **id** (:class:`sha1_git`): id of the release to add | ||||
Show All 39 Lines | def release_add(self, releases, db=None, cur=None): | ||||
db.copy_to(releases_filtered, 'tmp_release', db.release_add_cols, | db.copy_to(releases_filtered, 'tmp_release', db.release_add_cols, | ||||
cur) | cur) | ||||
db.release_add_from_temp(cur) | db.release_add_from_temp(cur) | ||||
return {'release:add': len(releases_missing)} | return {'release:add': len(releases_missing)} | ||||
@remote_api_endpoint('release/missing') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def release_missing(self, releases, db=None, cur=None): | def release_missing(self, releases, db=None, cur=None): | ||||
"""List releases missing from storage | """List releases missing from storage | ||||
Args: | Args: | ||||
releases: an iterable of release ids | releases: an iterable of release ids | ||||
Returns: | Returns: | ||||
a list of missing release ids | a list of missing release ids | ||||
""" | """ | ||||
if not releases: | if not releases: | ||||
return | return | ||||
for obj in db.release_missing_from_list(releases, cur): | for obj in db.release_missing_from_list(releases, cur): | ||||
yield obj[0] | yield obj[0] | ||||
@remote_api_endpoint('release') | |||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def release_get(self, releases, db=None, cur=None): | def release_get(self, releases, db=None, cur=None): | ||||
"""Given a list of sha1, return the releases's information | """Given a list of sha1, return the releases's information | ||||
Args: | Args: | ||||
releases: list of sha1s | releases: list of sha1s | ||||
Yields: | Yields: | ||||
dicts with the same keys as those given to `release_add` | dicts with the same keys as those given to `release_add` | ||||
(or ``None`` if a release does not exist) | (or ``None`` if a release does not exist) | ||||
""" | """ | ||||
for release in db.release_get_from_list(releases, cur): | for release in db.release_get_from_list(releases, cur): | ||||
data = converters.db_to_release( | data = converters.db_to_release( | ||||
dict(zip(db.release_get_cols, release)) | dict(zip(db.release_get_cols, release)) | ||||
) | ) | ||||
yield data if data['target_type'] else None | yield data if data['target_type'] else None | ||||
@remote_api_endpoint('snapshot/add') | |||||
@db_transaction() | @db_transaction() | ||||
def snapshot_add(self, snapshots, db=None, cur=None): | def snapshot_add(self, snapshots, db=None, cur=None): | ||||
"""Add snapshots to the storage. | """Add snapshots to the storage. | ||||
Args: | Args: | ||||
snapshot ([dict]): the snapshots to add, containing the | snapshot ([dict]): the snapshots to add, containing the | ||||
following keys: | following keys: | ||||
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines | def snapshot_add(self, snapshots, db=None, cur=None): | ||||
if self.journal_writer: | if self.journal_writer: | ||||
self.journal_writer.write_addition('snapshot', snapshot) | self.journal_writer.write_addition('snapshot', snapshot) | ||||
db.snapshot_add(snapshot['id'], cur) | db.snapshot_add(snapshot['id'], cur) | ||||
count += 1 | count += 1 | ||||
return {'snapshot:add': count} | return {'snapshot:add': count} | ||||
@remote_api_endpoint('snapshot') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def snapshot_get(self, snapshot_id, db=None, cur=None): | def snapshot_get(self, snapshot_id, db=None, cur=None): | ||||
"""Get the content, possibly partial, of a snapshot with the given id | """Get the content, possibly partial, of a snapshot with the given id | ||||
The branches of the snapshot are iterated in the lexicographical | The branches of the snapshot are iterated in the lexicographical | ||||
order of their names. | order of their names. | ||||
.. warning:: At most 1000 branches contained in the snapshot will be | .. warning:: At most 1000 branches contained in the snapshot will be | ||||
Show All 10 Lines | def snapshot_get(self, snapshot_id, db=None, cur=None): | ||||
whose keys are the branches' names. | whose keys are the branches' names. | ||||
* **next_branch**: the name of the first branch not returned | * **next_branch**: the name of the first branch not returned | ||||
or :const:`None` if the snapshot has less than 1000 | or :const:`None` if the snapshot has less than 1000 | ||||
branches. | branches. | ||||
""" | """ | ||||
return self.snapshot_get_branches(snapshot_id, db=db, cur=cur) | return self.snapshot_get_branches(snapshot_id, db=db, cur=cur) | ||||
@remote_api_endpoint('snapshot/by_origin_visit') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None): | def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None): | ||||
"""Get the content, possibly partial, of a snapshot for the given origin visit | """Get the content, possibly partial, of a snapshot for the given origin visit | ||||
The branches of the snapshot are iterated in the lexicographical | The branches of the snapshot are iterated in the lexicographical | ||||
order of their names. | order of their names. | ||||
.. warning:: At most 1000 branches contained in the snapshot will be | .. warning:: At most 1000 branches contained in the snapshot will be | ||||
Show All 17 Lines | def snapshot_get_by_origin_visit(self, origin, visit, db=None, cur=None): | ||||
""" | """ | ||||
snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur) | snapshot_id = db.snapshot_get_by_origin_visit(origin, visit, cur) | ||||
if snapshot_id: | if snapshot_id: | ||||
return self.snapshot_get(snapshot_id, db=db, cur=cur) | return self.snapshot_get(snapshot_id, db=db, cur=cur) | ||||
return None | return None | ||||
@remote_api_endpoint('snapshot/latest') | |||||
@db_transaction(statement_timeout=4000) | @db_transaction(statement_timeout=4000) | ||||
def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, | def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Get the content, possibly partial, of the latest snapshot for the | """Get the content, possibly partial, of the latest snapshot for the | ||||
given origin, optionally only from visits that have one of the given | given origin, optionally only from visits that have one of the given | ||||
allowed_statuses | allowed_statuses | ||||
The branches of the snapshot are iterated in the lexicographical | The branches of the snapshot are iterated in the lexicographical | ||||
Show All 31 Lines | def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, | ||||
if origin_visit and origin_visit['snapshot']: | if origin_visit and origin_visit['snapshot']: | ||||
snapshot = self.snapshot_get( | snapshot = self.snapshot_get( | ||||
origin_visit['snapshot'], db=db, cur=cur) | origin_visit['snapshot'], db=db, cur=cur) | ||||
if not snapshot: | if not snapshot: | ||||
raise ValueError( | raise ValueError( | ||||
'last origin visit references an unknown snapshot') | 'last origin visit references an unknown snapshot') | ||||
return snapshot | return snapshot | ||||
@remote_api_endpoint('snapshot/count_branches') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def snapshot_count_branches(self, snapshot_id, db=None, cur=None): | def snapshot_count_branches(self, snapshot_id, db=None, cur=None): | ||||
"""Count the number of branches in the snapshot with the given id | """Count the number of branches in the snapshot with the given id | ||||
Args: | Args: | ||||
snapshot_id (bytes): identifier of the snapshot | snapshot_id (bytes): identifier of the snapshot | ||||
Returns: | Returns: | ||||
dict: A dict whose keys are the target types of branches and | dict: A dict whose keys are the target types of branches and | ||||
values their corresponding amount | values their corresponding amount | ||||
""" | """ | ||||
return dict([bc for bc in | return dict([bc for bc in | ||||
db.snapshot_count_branches(snapshot_id, cur)]) | db.snapshot_count_branches(snapshot_id, cur)]) | ||||
@remote_api_endpoint('snapshot/get_branches') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def snapshot_get_branches(self, snapshot_id, branches_from=b'', | def snapshot_get_branches(self, snapshot_id, branches_from=b'', | ||||
branches_count=1000, target_types=None, | branches_count=1000, target_types=None, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Get the content, possibly partial, of a snapshot with the given id | """Get the content, possibly partial, of a snapshot with the given id | ||||
The branches of the snapshot are iterated in the lexicographical | The branches of the snapshot are iterated in the lexicographical | ||||
order of their names. | order of their names. | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | def snapshot_get_branches(self, snapshot_id, branches_from=b'', | ||||
return { | return { | ||||
'id': snapshot_id, | 'id': snapshot_id, | ||||
'branches': branches, | 'branches': branches, | ||||
'next_branch': next_branch, | 'next_branch': next_branch, | ||||
} | } | ||||
return None | return None | ||||
@remote_api_endpoint('origin/visit/add') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_add(self, origin, date, type, | def origin_visit_add(self, origin, date, type, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Add an origin_visit for the origin at ts with status 'ongoing'. | """Add an origin_visit for the origin at ts with status 'ongoing'. | ||||
Args: | Args: | ||||
origin (str): visited origin's identifier or URL | origin (str): visited origin's identifier or URL | ||||
date (Union[str,datetime]): timestamp of such visit | date (Union[str,datetime]): timestamp of such visit | ||||
Show All 22 Lines | def origin_visit_add(self, origin, date, type, | ||||
'visit': visit_id, | 'visit': visit_id, | ||||
'status': 'ongoing', 'metadata': None, 'snapshot': None}) | 'status': 'ongoing', 'metadata': None, 'snapshot': None}) | ||||
return { | return { | ||||
'origin': origin_url, | 'origin': origin_url, | ||||
'visit': visit_id, | 'visit': visit_id, | ||||
} | } | ||||
@remote_api_endpoint('origin/visit/update') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_update(self, origin, visit_id, status=None, | def origin_visit_update(self, origin, visit_id, status=None, | ||||
metadata=None, snapshot=None, | metadata=None, snapshot=None, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Update an origin_visit's status. | """Update an origin_visit's status. | ||||
Args: | Args: | ||||
origin (str): visited origin's URL | origin (str): visited origin's URL | ||||
Show All 27 Lines | def origin_visit_update(self, origin, visit_id, status=None, | ||||
if updates: | if updates: | ||||
if self.journal_writer: | if self.journal_writer: | ||||
self.journal_writer.write_update('origin_visit', { | self.journal_writer.write_update('origin_visit', { | ||||
**visit, **updates}) | **visit, **updates}) | ||||
db.origin_visit_update(origin_url, visit_id, updates, cur) | db.origin_visit_update(origin_url, visit_id, updates, cur) | ||||
@remote_api_endpoint('origin/visit/upsert') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_upsert(self, visits, db=None, cur=None): | def origin_visit_upsert(self, visits, db=None, cur=None): | ||||
"""Add a origin_visits with a specific id and with all its data. | """Add a origin_visits with a specific id and with all its data. | ||||
If there is already an origin_visit with the same | If there is already an origin_visit with the same | ||||
`(origin_id, visit_id)`, overwrites it. | `(origin_id, visit_id)`, overwrites it. | ||||
Args: | Args: | ||||
visits: iterable of dicts with keys: | visits: iterable of dicts with keys: | ||||
Show All 17 Lines | def origin_visit_upsert(self, visits, db=None, cur=None): | ||||
if self.journal_writer: | if self.journal_writer: | ||||
for visit in visits: | for visit in visits: | ||||
self.journal_writer.write_addition('origin_visit', visit) | self.journal_writer.write_addition('origin_visit', visit) | ||||
for visit in visits: | for visit in visits: | ||||
# TODO: upsert them all in a single query | # TODO: upsert them all in a single query | ||||
db.origin_visit_upsert(**visit, cur=cur) | db.origin_visit_upsert(**visit, cur=cur) | ||||
@remote_api_endpoint('origin/visit/get') | |||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | def origin_visit_get(self, origin, last_visit=None, limit=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Retrieve all the origin's visit's information. | """Retrieve all the origin's visit's information. | ||||
Args: | Args: | ||||
origin (str): The visited origin | origin (str): The visited origin | ||||
last_visit: Starting point from which listing the next visits | last_visit: Starting point from which listing the next visits | ||||
Default to None | Default to None | ||||
limit (int): Number of results to return from the last visit. | limit (int): Number of results to return from the last visit. | ||||
Default to None | Default to None | ||||
Yields: | Yields: | ||||
List of visits. | List of visits. | ||||
""" | """ | ||||
for line in db.origin_visit_get_all( | for line in db.origin_visit_get_all( | ||||
origin, last_visit=last_visit, limit=limit, cur=cur): | origin, last_visit=last_visit, limit=limit, cur=cur): | ||||
data = dict(zip(db.origin_visit_get_cols, line)) | data = dict(zip(db.origin_visit_get_cols, line)) | ||||
yield data | yield data | ||||
@remote_api_endpoint('origin/visit/find_by_date') | |||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): | def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): | ||||
"""Retrieves the origin visit whose date is closest to the provided | """Retrieves the origin visit whose date is closest to the provided | ||||
timestamp. | timestamp. | ||||
In case of a tie, the visit with largest id is selected. | In case of a tie, the visit with largest id is selected. | ||||
Args: | Args: | ||||
origin (str): The occurrence's origin (URL). | origin (str): The occurrence's origin (URL). | ||||
target (datetime): target timestamp | target (datetime): target timestamp | ||||
Returns: | Returns: | ||||
A visit. | A visit. | ||||
""" | """ | ||||
line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) | ||||
if line: | if line: | ||||
return dict(zip(db.origin_visit_get_cols, line)) | return dict(zip(db.origin_visit_get_cols, line)) | ||||
@remote_api_endpoint('origin/visit/getby') | |||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_visit_get_by(self, origin, visit, db=None, cur=None): | def origin_visit_get_by(self, origin, visit, db=None, cur=None): | ||||
"""Retrieve origin visit's information. | """Retrieve origin visit's information. | ||||
Args: | Args: | ||||
origin: The occurrence's origin (identifier). | origin: The occurrence's origin (identifier). | ||||
Returns: | Returns: | ||||
The information on that particular (origin, visit) or None if | The information on that particular (origin, visit) or None if | ||||
it does not exist | it does not exist | ||||
""" | """ | ||||
ori_visit = db.origin_visit_get(origin, visit, cur) | ori_visit = db.origin_visit_get(origin, visit, cur) | ||||
if not ori_visit: | if not ori_visit: | ||||
return None | return None | ||||
return dict(zip(db.origin_visit_get_cols, ori_visit)) | return dict(zip(db.origin_visit_get_cols, ori_visit)) | ||||
@remote_api_endpoint('origin/visit/get_latest') | |||||
@db_transaction(statement_timeout=4000) | @db_transaction(statement_timeout=4000) | ||||
def origin_visit_get_latest( | def origin_visit_get_latest( | ||||
self, origin, allowed_statuses=None, require_snapshot=False, | self, origin, allowed_statuses=None, require_snapshot=False, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Get the latest origin visit for the given origin, optionally | """Get the latest origin visit for the given origin, optionally | ||||
looking only for those with one of the given allowed_statuses | looking only for those with one of the given allowed_statuses | ||||
or for those with a known snapshot. | or for those with a known snapshot. | ||||
Show All 18 Lines | def origin_visit_get_latest( | ||||
associated to the visit | associated to the visit | ||||
""" | """ | ||||
origin_visit = db.origin_visit_get_latest( | origin_visit = db.origin_visit_get_latest( | ||||
origin, allowed_statuses=allowed_statuses, | origin, allowed_statuses=allowed_statuses, | ||||
require_snapshot=require_snapshot, cur=cur) | require_snapshot=require_snapshot, cur=cur) | ||||
if origin_visit: | if origin_visit: | ||||
return dict(zip(db.origin_visit_get_cols, origin_visit)) | return dict(zip(db.origin_visit_get_cols, origin_visit)) | ||||
@remote_api_endpoint('object/find_by_sha1_git') | |||||
@db_transaction(statement_timeout=2000) | @db_transaction(statement_timeout=2000) | ||||
def object_find_by_sha1_git(self, ids, db=None, cur=None): | def object_find_by_sha1_git(self, ids, db=None, cur=None): | ||||
"""Return the objects found with the given ids. | """Return the objects found with the given ids. | ||||
Args: | Args: | ||||
ids: a generator of sha1_gits | ids: a generator of sha1_gits | ||||
Returns: | Returns: | ||||
Show All 10 Lines | def object_find_by_sha1_git(self, ids, db=None, cur=None): | ||||
for retval in db.object_find_by_sha1_git(ids, cur=cur): | for retval in db.object_find_by_sha1_git(ids, cur=cur): | ||||
if retval[1]: | if retval[1]: | ||||
ret[retval[0]].append(dict(zip(db.object_find_by_sha1_git_cols, | ret[retval[0]].append(dict(zip(db.object_find_by_sha1_git_cols, | ||||
retval))) | retval))) | ||||
return ret | return ret | ||||
@remote_api_endpoint('origin/get') | |||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def origin_get(self, origins, db=None, cur=None): | def origin_get(self, origins, db=None, cur=None): | ||||
"""Return origins, either all identified by their ids or all | """Return origins, either all identified by their ids or all | ||||
identified by tuples (type, url). | identified by tuples (type, url). | ||||
If the url is given and the type is omitted, one of the origins with | If the url is given and the type is omitted, one of the origins with | ||||
that url is returned. | that url is returned. | ||||
Show All 32 Lines | def origin_get(self, origins, db=None, cur=None): | ||||
assert len(results) == 1 | assert len(results) == 1 | ||||
if results[0]['url'] is not None: | if results[0]['url'] is not None: | ||||
return results[0] | return results[0] | ||||
else: | else: | ||||
return None | return None | ||||
else: | else: | ||||
return [None if res['url'] is None else res for res in results] | return [None if res['url'] is None else res for res in results] | ||||
@remote_api_endpoint('origin/get_sha1') | |||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_get_by_sha1(self, sha1s, db=None, cur=None): | def origin_get_by_sha1(self, sha1s, db=None, cur=None): | ||||
"""Return origins, identified by the sha1 of their URLs. | """Return origins, identified by the sha1 of their URLs. | ||||
Args: | Args: | ||||
sha1s (list[bytes]): a list of sha1s | sha1s (list[bytes]): a list of sha1s | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`, or None if an | by :meth:`swh.storage.storage.Storage.origin_get`, or None if an | ||||
origin matching the sha1 is not found. | origin matching the sha1 is not found. | ||||
""" | """ | ||||
for line in db.origin_get_by_sha1(sha1s, cur): | for line in db.origin_get_by_sha1(sha1s, cur): | ||||
if line[0] is not None: | if line[0] is not None: | ||||
yield dict(zip(db.origin_cols, line)) | yield dict(zip(db.origin_cols, line)) | ||||
else: | else: | ||||
yield None | yield None | ||||
@remote_api_endpoint('origin/visit/get_random') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_visit_get_random( | def origin_visit_get_random( | ||||
self, type, db=None, cur=None) -> Mapping[str, Any]: | self, type, db=None, cur=None) -> Mapping[str, Any]: | ||||
"""Randomly select one origin from the archive | """Randomly select one origin from the archive | ||||
Returns: | Returns: | ||||
origin dict selected randomly on the dataset if found | origin dict selected randomly on the dataset if found | ||||
""" | """ | ||||
data: Dict[str, Any] = {} | data: Dict[str, Any] = {} | ||||
result = db.origin_visit_get_random(type, cur) | result = db.origin_visit_get_random(type, cur) | ||||
if result: | if result: | ||||
data = dict(zip(db.origin_visit_get_cols, result)) | data = dict(zip(db.origin_visit_get_cols, result)) | ||||
return data | return data | ||||
@remote_api_endpoint('origin/get_range') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_get_range(self, origin_from=1, origin_count=100, | def origin_get_range(self, origin_from=1, origin_count=100, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Retrieve ``origin_count`` origins whose ids are greater | """Retrieve ``origin_count`` origins whose ids are greater | ||||
or equal than ``origin_from``. | or equal than ``origin_from``. | ||||
Origins are sorted by id before retrieving them. | Origins are sorted by id before retrieving them. | ||||
Args: | Args: | ||||
origin_from (int): the minimum id of origins to retrieve | origin_from (int): the minimum id of origins to retrieve | ||||
origin_count (int): the maximum number of origins to retrieve | origin_count (int): the maximum number of origins to retrieve | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`. | by :meth:`swh.storage.storage.Storage.origin_get`. | ||||
""" | """ | ||||
for origin in db.origin_get_range(origin_from, origin_count, cur): | for origin in db.origin_get_range(origin_from, origin_count, cur): | ||||
yield dict(zip(db.origin_get_range_cols, origin)) | yield dict(zip(db.origin_get_range_cols, origin)) | ||||
@remote_api_endpoint('origin/search') | |||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_search(self, url_pattern, offset=0, limit=50, | def origin_search(self, url_pattern, offset=0, limit=50, | ||||
regexp=False, with_visit=False, db=None, cur=None): | regexp=False, with_visit=False, db=None, cur=None): | ||||
"""Search for origins whose urls contain a provided string pattern | """Search for origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
The search is performed in a case insensitive way. | The search is performed in a case insensitive way. | ||||
Args: | Args: | ||||
url_pattern (str): the string pattern to search for in origin urls | url_pattern (str): the string pattern to search for in origin urls | ||||
offset (int): number of found origins to skip before returning | offset (int): number of found origins to skip before returning | ||||
results | results | ||||
limit (int): the maximum number of found origins to return | limit (int): the maximum number of found origins to return | ||||
regexp (bool): if True, consider the provided pattern as a regular | regexp (bool): if True, consider the provided pattern as a regular | ||||
expression and return origins whose urls match it | expression and return origins whose urls match it | ||||
with_visit (bool): if True, filter out origins with no visit | with_visit (bool): if True, filter out origins with no visit | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`. | by :meth:`swh.storage.storage.Storage.origin_get`. | ||||
""" | """ | ||||
for origin in db.origin_search(url_pattern, offset, limit, | for origin in db.origin_search(url_pattern, offset, limit, | ||||
regexp, with_visit, cur): | regexp, with_visit, cur): | ||||
yield dict(zip(db.origin_cols, origin)) | yield dict(zip(db.origin_cols, origin)) | ||||
@remote_api_endpoint('origin/count') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_count(self, url_pattern, regexp=False, | def origin_count(self, url_pattern, regexp=False, | ||||
with_visit=False, db=None, cur=None): | with_visit=False, db=None, cur=None): | ||||
"""Count origins whose urls contain a provided string pattern | """Count origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
The pattern search in origin urls is performed in a case insensitive | The pattern search in origin urls is performed in a case insensitive | ||||
way. | way. | ||||
Args: | Args: | ||||
url_pattern (str): the string pattern to search for in origin urls | url_pattern (str): the string pattern to search for in origin urls | ||||
regexp (bool): if True, consider the provided pattern as a regular | regexp (bool): if True, consider the provided pattern as a regular | ||||
expression and return origins whose urls match it | expression and return origins whose urls match it | ||||
with_visit (bool): if True, filter out origins with no visit | with_visit (bool): if True, filter out origins with no visit | ||||
Returns: | Returns: | ||||
int: The number of origins matching the search criterion. | int: The number of origins matching the search criterion. | ||||
""" | """ | ||||
return db.origin_count(url_pattern, regexp, with_visit, cur) | return db.origin_count(url_pattern, regexp, with_visit, cur) | ||||
@remote_api_endpoint('origin/add_multi') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_add(self, origins, db=None, cur=None): | def origin_add(self, origins, db=None, cur=None): | ||||
"""Add origins to the storage | """Add origins to the storage | ||||
Args: | Args: | ||||
origins: list of dictionaries representing the individual origins, | origins: list of dictionaries representing the individual origins, | ||||
with the following keys: | with the following keys: | ||||
- type: the origin type ('git', 'svn', 'deb', ...) | - type: the origin type ('git', 'svn', 'deb', ...) | ||||
- url (bytes): the url the origin points to | - url (bytes): the url the origin points to | ||||
Returns: | Returns: | ||||
list: given origins as dict updated with their id | list: given origins as dict updated with their id | ||||
""" | """ | ||||
origins = copy.deepcopy(origins) | origins = copy.deepcopy(origins) | ||||
for origin in origins: | for origin in origins: | ||||
self.origin_add_one(origin, db=db, cur=cur) | self.origin_add_one(origin, db=db, cur=cur) | ||||
return origins | return origins | ||||
@remote_api_endpoint('origin/add') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_add_one(self, origin, db=None, cur=None): | def origin_add_one(self, origin, db=None, cur=None): | ||||
"""Add origin to the storage | """Add origin to the storage | ||||
Args: | Args: | ||||
origin: dictionary representing the individual origin to add. This | origin: dictionary representing the individual origin to add. This | ||||
dict has the following keys: | dict has the following keys: | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | def refresh_stat_counters(self, db=None, cur=None): | ||||
'revision', | 'revision', | ||||
'revision_history', | 'revision_history', | ||||
'skipped_content', | 'skipped_content', | ||||
'snapshot'] | 'snapshot'] | ||||
for key in keys: | for key in keys: | ||||
cur.execute('select * from swh_update_counter(%s)', (key,)) | cur.execute('select * from swh_update_counter(%s)', (key,)) | ||||
@remote_api_endpoint('origin/metadata/add') | |||||
@db_transaction() | @db_transaction() | ||||
def origin_metadata_add(self, origin_url, ts, provider, tool, metadata, | def origin_metadata_add(self, origin_url, ts, provider, tool, metadata, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
""" Add an origin_metadata for the origin at ts with provenance and | """ Add an origin_metadata for the origin at ts with provenance and | ||||
metadata. | metadata. | ||||
Args: | Args: | ||||
origin_url (str): the origin url for which the metadata is added | origin_url (str): the origin url for which the metadata is added | ||||
ts (datetime): timestamp of the found metadata | ts (datetime): timestamp of the found metadata | ||||
provider (int): the provider of metadata (ex:'hal') | provider (int): the provider of metadata (ex:'hal') | ||||
tool (int): tool used to extract metadata | tool (int): tool used to extract metadata | ||||
metadata (jsonb): the metadata retrieved at the time and location | metadata (jsonb): the metadata retrieved at the time and location | ||||
""" | """ | ||||
if isinstance(ts, str): | if isinstance(ts, str): | ||||
ts = dateutil.parser.parse(ts) | ts = dateutil.parser.parse(ts) | ||||
db.origin_metadata_add(origin_url, ts, provider, tool, | db.origin_metadata_add(origin_url, ts, provider, tool, | ||||
metadata, cur) | metadata, cur) | ||||
@remote_api_endpoint('origin/metadata/get') | |||||
@db_transaction_generator(statement_timeout=500) | @db_transaction_generator(statement_timeout=500) | ||||
def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, | def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, | ||||
cur=None): | cur=None): | ||||
"""Retrieve list of all origin_metadata entries for the origin_id | """Retrieve list of all origin_metadata entries for the origin_id | ||||
Args: | Args: | ||||
origin_url (str): the origin's URL | origin_url (str): the origin's URL | ||||
provider_type (str): (optional) type of provider | provider_type (str): (optional) type of provider | ||||
Show All 9 Lines | def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, | ||||
- provider_name (str) | - provider_name (str) | ||||
- provider_type (str) | - provider_type (str) | ||||
- provider_url (str) | - provider_url (str) | ||||
""" | """ | ||||
for line in db.origin_metadata_get_by(origin_url, provider_type, cur): | for line in db.origin_metadata_get_by(origin_url, provider_type, cur): | ||||
yield dict(zip(db.origin_metadata_get_cols, line)) | yield dict(zip(db.origin_metadata_get_cols, line)) | ||||
@remote_api_endpoint('tool/add') | |||||
@db_transaction() | @db_transaction() | ||||
def tool_add(self, tools, db=None, cur=None): | def tool_add(self, tools, db=None, cur=None): | ||||
"""Add new tools to the storage. | """Add new tools to the storage. | ||||
Args: | Args: | ||||
tools (iterable of :class:`dict`): Tool information to add to | tools (iterable of :class:`dict`): Tool information to add to | ||||
storage. Each tool is a :class:`dict` with the following keys: | storage. Each tool is a :class:`dict` with the following keys: | ||||
Show All 11 Lines | def tool_add(self, tools, db=None, cur=None): | ||||
db.mktemp_tool(cur) | db.mktemp_tool(cur) | ||||
db.copy_to(tools, 'tmp_tool', | db.copy_to(tools, 'tmp_tool', | ||||
['name', 'version', 'configuration'], | ['name', 'version', 'configuration'], | ||||
cur) | cur) | ||||
tools = db.tool_add_from_temp(cur) | tools = db.tool_add_from_temp(cur) | ||||
return [dict(zip(db.tool_cols, line)) for line in tools] | return [dict(zip(db.tool_cols, line)) for line in tools] | ||||
@remote_api_endpoint('tool/data') | |||||
@db_transaction(statement_timeout=500) | @db_transaction(statement_timeout=500) | ||||
def tool_get(self, tool, db=None, cur=None): | def tool_get(self, tool, db=None, cur=None): | ||||
"""Retrieve tool information. | """Retrieve tool information. | ||||
Args: | Args: | ||||
tool (dict): Tool information we want to retrieve from storage. | tool (dict): Tool information we want to retrieve from storage. | ||||
The dicts have the same keys as those used in :func:`tool_add`. | The dicts have the same keys as those used in :func:`tool_add`. | ||||
Returns: | Returns: | ||||
dict: The full tool information if it exists (``id`` included), | dict: The full tool information if it exists (``id`` included), | ||||
None otherwise. | None otherwise. | ||||
""" | """ | ||||
tool_conf = tool['configuration'] | tool_conf = tool['configuration'] | ||||
if isinstance(tool_conf, dict): | if isinstance(tool_conf, dict): | ||||
tool_conf = json.dumps(tool_conf) | tool_conf = json.dumps(tool_conf) | ||||
idx = db.tool_get(tool['name'], | idx = db.tool_get(tool['name'], | ||||
tool['version'], | tool['version'], | ||||
tool_conf) | tool_conf) | ||||
if not idx: | if not idx: | ||||
return None | return None | ||||
return dict(zip(db.tool_cols, idx)) | return dict(zip(db.tool_cols, idx)) | ||||
@remote_api_endpoint('provider/add') | |||||
@db_transaction() | @db_transaction() | ||||
def metadata_provider_add(self, provider_name, provider_type, provider_url, | def metadata_provider_add(self, provider_name, provider_type, provider_url, | ||||
metadata, db=None, cur=None): | metadata, db=None, cur=None): | ||||
"""Add a metadata provider. | """Add a metadata provider. | ||||
Args: | Args: | ||||
provider_name (str): Its name | provider_name (str): Its name | ||||
provider_type (str): Its type (eg. `'deposit-client'`) | provider_type (str): Its type (eg. `'deposit-client'`) | ||||
provider_url (str): Its URL | provider_url (str): Its URL | ||||
metadata: JSON-encodable object | metadata: JSON-encodable object | ||||
Returns: | Returns: | ||||
int: an identifier of the provider | int: an identifier of the provider | ||||
""" | """ | ||||
return db.metadata_provider_add(provider_name, provider_type, | return db.metadata_provider_add(provider_name, provider_type, | ||||
provider_url, metadata, cur) | provider_url, metadata, cur) | ||||
@remote_api_endpoint('provider/get') | |||||
@db_transaction() | @db_transaction() | ||||
def metadata_provider_get(self, provider_id, db=None, cur=None): | def metadata_provider_get(self, provider_id, db=None, cur=None): | ||||
"""Get a metadata provider | """Get a metadata provider | ||||
Args: | Args: | ||||
provider_id: Its identifier, as given by `metadata_provider_add`. | provider_id: Its identifier, as given by `metadata_provider_add`. | ||||
Returns: | Returns: | ||||
dict: same as `metadata_provider_add`; | dict: same as `metadata_provider_add`; | ||||
or None if it does not exist. | or None if it does not exist. | ||||
""" | """ | ||||
result = db.metadata_provider_get(provider_id) | result = db.metadata_provider_get(provider_id) | ||||
if not result: | if not result: | ||||
return None | return None | ||||
return dict(zip(db.metadata_provider_cols, result)) | return dict(zip(db.metadata_provider_cols, result)) | ||||
@remote_api_endpoint('provider/getby') | |||||
@db_transaction() | @db_transaction() | ||||
def metadata_provider_get_by(self, provider, db=None, cur=None): | def metadata_provider_get_by(self, provider, db=None, cur=None): | ||||
"""Get a metadata provider | """Get a metadata provider | ||||
Args: | Args: | ||||
provider (dict): A dictionary with keys: | provider (dict): A dictionary with keys: | ||||
* provider_name: Its name | * provider_name: Its name | ||||
* provider_url: Its URL | * provider_url: Its URL | ||||
Returns: | Returns: | ||||
dict: same as `metadata_provider_add`; | dict: same as `metadata_provider_add`; | ||||
or None if it does not exist. | or None if it does not exist. | ||||
""" | """ | ||||
result = db.metadata_provider_get_by(provider['provider_name'], | result = db.metadata_provider_get_by(provider['provider_name'], | ||||
provider['provider_url']) | provider['provider_url']) | ||||
if not result: | if not result: | ||||
return None | return None | ||||
return dict(zip(db.metadata_provider_cols, result)) | return dict(zip(db.metadata_provider_cols, result)) | ||||
@remote_api_endpoint('algos/diff_directories') | |||||
def diff_directories(self, from_dir, to_dir, track_renaming=False): | def diff_directories(self, from_dir, to_dir, track_renaming=False): | ||||
"""Compute the list of file changes introduced between two arbitrary | """Compute the list of file changes introduced between two arbitrary | ||||
directories (insertion / deletion / modification / renaming of files). | directories (insertion / deletion / modification / renaming of files). | ||||
Args: | Args: | ||||
from_dir (bytes): identifier of the directory to compare from | from_dir (bytes): identifier of the directory to compare from | ||||
to_dir (bytes): identifier of the directory to compare to | to_dir (bytes): identifier of the directory to compare to | ||||
track_renaming (bool): whether or not to track files renaming | track_renaming (bool): whether or not to track files renaming | ||||
Returns: | Returns: | ||||
A list of dict describing the introduced file changes | A list of dict describing the introduced file changes | ||||
(see :func:`swh.storage.algos.diff.diff_directories` | (see :func:`swh.storage.algos.diff.diff_directories` | ||||
for more details). | for more details). | ||||
""" | """ | ||||
return diff.diff_directories(self, from_dir, to_dir, track_renaming) | return diff.diff_directories(self, from_dir, to_dir, track_renaming) | ||||
@remote_api_endpoint('algos/diff_revisions') | |||||
def diff_revisions(self, from_rev, to_rev, track_renaming=False): | def diff_revisions(self, from_rev, to_rev, track_renaming=False): | ||||
"""Compute the list of file changes introduced between two arbitrary | """Compute the list of file changes introduced between two arbitrary | ||||
revisions (insertion / deletion / modification / renaming of files). | revisions (insertion / deletion / modification / renaming of files). | ||||
Args: | Args: | ||||
from_rev (bytes): identifier of the revision to compare from | from_rev (bytes): identifier of the revision to compare from | ||||
to_rev (bytes): identifier of the revision to compare to | to_rev (bytes): identifier of the revision to compare to | ||||
track_renaming (bool): whether or not to track files renaming | track_renaming (bool): whether or not to track files renaming | ||||
Returns: | Returns: | ||||
A list of dict describing the introduced file changes | A list of dict describing the introduced file changes | ||||
(see :func:`swh.storage.algos.diff.diff_directories` | (see :func:`swh.storage.algos.diff.diff_directories` | ||||
for more details). | for more details). | ||||
""" | """ | ||||
return diff.diff_revisions(self, from_rev, to_rev, track_renaming) | return diff.diff_revisions(self, from_rev, to_rev, track_renaming) | ||||
@remote_api_endpoint('algos/diff_revision') | |||||
def diff_revision(self, revision, track_renaming=False): | def diff_revision(self, revision, track_renaming=False): | ||||
"""Compute the list of file changes introduced by a specific revision | """Compute the list of file changes introduced by a specific revision | ||||
(insertion / deletion / modification / renaming of files) by comparing | (insertion / deletion / modification / renaming of files) by comparing | ||||
it against its first parent. | it against its first parent. | ||||
Args: | Args: | ||||
revision (bytes): identifier of the revision from which to | revision (bytes): identifier of the revision from which to | ||||
compute the list of files changes | compute the list of files changes | ||||
track_renaming (bool): whether or not to track files renaming | track_renaming (bool): whether or not to track files renaming | ||||
Returns: | Returns: | ||||
A list of dict describing the introduced file changes | A list of dict describing the introduced file changes | ||||
(see :func:`swh.storage.algos.diff.diff_directories` | (see :func:`swh.storage.algos.diff.diff_directories` | ||||
for more details). | for more details). | ||||
""" | """ | ||||
return diff.diff_revision(self, revision, track_renaming) | return diff.diff_revision(self, revision, track_renaming) |