Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/__init__.py
Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | def get_indexer_storage(cls, args): | ||||
elif cls == 'memory': | elif cls == 'memory': | ||||
from .in_memory import IndexerStorage | from .in_memory import IndexerStorage | ||||
else: | else: | ||||
raise ValueError('Unknown indexer storage class `%s`' % cls) | raise ValueError('Unknown indexer storage class `%s`' % cls) | ||||
return IndexerStorage(**args) | return IndexerStorage(**args) | ||||
def _check_duplicates(data, key): | def _check_id_duplicates(data): | ||||
""" | """ | ||||
If any two dictionaries in `data` have the same value for the | If any two dictionaries in `data` have the same id, raises | ||||
key, raises a `ValueError`. | a `ValueError`. | ||||
Values associated to the key must be hashable. | Values associated to the key must be hashable. | ||||
Args: | Args: | ||||
data (List[dict]): List of dictionaries to be inserted | data (List[dict]): List of dictionaries to be inserted | ||||
key (str): Name of the key that acts as id. | |||||
>>> _check_duplicates([ | >>> _check_id_duplicates([ | ||||
... {'id': 'foo', 'data': 'spam'}, | ... {'id': 'foo', 'data': 'spam'}, | ||||
... {'id': 'bar', 'data': 'egg'}, | ... {'id': 'bar', 'data': 'egg'}, | ||||
... ], 'id') | ... ]) | ||||
>>> _check_duplicates([ | >>> _check_id_duplicates([ | ||||
... {'id': 'foo', 'data': 'spam'}, | ... {'id': 'foo', 'data': 'spam'}, | ||||
... {'id': 'foo', 'data': 'egg'}, | ... {'id': 'foo', 'data': 'egg'}, | ||||
... ], 'id') | ... ]) | ||||
Traceback (most recent call last): | Traceback (most recent call last): | ||||
... | ... | ||||
ValueError: The same id is present more than once. | ValueError: The same id is present more than once. | ||||
""" | """ | ||||
if len({item[key] for item in data}) < len(data): | if len({item['id'] for item in data}) < len(data): | ||||
raise ValueError( | raise ValueError('The same id is present more than once.') | ||||
'The same {} is present more than once.'.format(key)) | |||||
class IndexerStorage: | class IndexerStorage: | ||||
"""SWH Indexer Storage | """SWH Indexer Storage | ||||
""" | """ | ||||
def __init__(self, db, min_pool_conns=1, max_pool_conns=10): | def __init__(self, db, min_pool_conns=1, max_pool_conns=10): | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 153 Lines • ▼ Show 20 Lines | def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, | ||||
- **encoding** (bytes): raw content's encoding | - **encoding** (bytes): raw content's encoding | ||||
- **indexer_configuration_id** (int): tool's id used to | - **indexer_configuration_id** (int): tool's id used to | ||||
compute the results | compute the results | ||||
- **conflict_update** (bool): Flag to determine if we want to | - **conflict_update** (bool): Flag to determine if we want to | ||||
overwrite (``True``) or skip duplicates (``False``, the | overwrite (``True``) or skip duplicates (``False``, the | ||||
default) | default) | ||||
""" | """ | ||||
_check_duplicates(mimetypes, 'id') | _check_id_duplicates(mimetypes) | ||||
mimetypes.sort(key=lambda m: m['id']) | mimetypes.sort(key=lambda m: m['id']) | ||||
db.mktemp_content_mimetype(cur) | db.mktemp_content_mimetype(cur) | ||||
db.copy_to(mimetypes, 'tmp_content_mimetype', | db.copy_to(mimetypes, 'tmp_content_mimetype', | ||||
['id', 'mimetype', 'encoding', 'indexer_configuration_id'], | ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.content_mimetype_add_from_temp(conflict_update, cur) | db.content_mimetype_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('content_mimetype') | @remote_api_endpoint('content_mimetype') | ||||
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | def content_language_add(self, languages, conflict_update=False, db=None, | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
- **lang** (bytes): language detected | - **lang** (bytes): language detected | ||||
conflict_update (bool): Flag to determine if we want to | conflict_update (bool): Flag to determine if we want to | ||||
overwrite (true) or skip duplicates (false, the | overwrite (true) or skip duplicates (false, the | ||||
default) | default) | ||||
""" | """ | ||||
_check_duplicates(languages, 'id') | _check_id_duplicates(languages) | ||||
languages.sort(key=lambda m: m['id']) | languages.sort(key=lambda m: m['id']) | ||||
db.mktemp_content_language(cur) | db.mktemp_content_language(cur) | ||||
# empty language is mapped to 'unknown' | # empty language is mapped to 'unknown' | ||||
db.copy_to( | db.copy_to( | ||||
({ | ({ | ||||
'id': l['id'], | 'id': l['id'], | ||||
'lang': 'unknown' if not l['lang'] else l['lang'], | 'lang': 'unknown' if not l['lang'] else l['lang'], | ||||
'indexer_configuration_id': l['indexer_configuration_id'], | 'indexer_configuration_id': l['indexer_configuration_id'], | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | def content_ctags_add(self, ctags, conflict_update=False, db=None, | ||||
Args: | Args: | ||||
ctags (iterable): dictionaries with keys: | ctags (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
- **ctags** ([list): List of dictionary with keys: name, kind, | - **ctags** ([list): List of dictionary with keys: name, kind, | ||||
line, lang | line, lang | ||||
""" | """ | ||||
_check_duplicates(ctags, 'id') | _check_id_duplicates(ctags) | ||||
ctags.sort(key=lambda m: m['id']) | ctags.sort(key=lambda m: m['id']) | ||||
def _convert_ctags(__ctags): | def _convert_ctags(__ctags): | ||||
"""Convert ctags dict to list of ctags. | """Convert ctags dict to list of ctags. | ||||
""" | """ | ||||
for ctags in __ctags: | for ctags in __ctags: | ||||
yield from converters.ctags_to_db(ctags) | yield from converters.ctags_to_db(ctags) | ||||
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | def content_fossology_license_add(self, licenses, conflict_update=False, | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
Returns: | Returns: | ||||
list: content_license entries which failed due to unknown licenses | list: content_license entries which failed due to unknown licenses | ||||
""" | """ | ||||
_check_duplicates(licenses, 'id') | _check_id_duplicates(licenses) | ||||
licenses.sort(key=lambda m: m['id']) | licenses.sort(key=lambda m: m['id']) | ||||
db.mktemp_content_fossology_license(cur) | db.mktemp_content_fossology_license(cur) | ||||
db.copy_to( | db.copy_to( | ||||
({ | ({ | ||||
'id': sha1['id'], | 'id': sha1['id'], | ||||
'indexer_configuration_id': sha1['indexer_configuration_id'], | 'indexer_configuration_id': sha1['indexer_configuration_id'], | ||||
'license': license, | 'license': license, | ||||
} for sha1 in licenses | } for sha1 in licenses | ||||
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines | def content_metadata_get(self, ids, db=None, cur=None): | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dictionaries with the following keys: | dictionaries with the following keys: | ||||
id (bytes) | id (bytes) | ||||
translated_metadata (str): associated metadata | metadata (str): associated metadata | ||||
tool (dict): tool used to compute metadata | tool (dict): tool used to compute metadata | ||||
""" | """ | ||||
for c in db.content_metadata_get_from_list(ids, cur): | for c in db.content_metadata_get_from_list(ids, cur): | ||||
yield converters.db_to_metadata( | yield converters.db_to_metadata( | ||||
dict(zip(db.content_metadata_cols, c))) | dict(zip(db.content_metadata_cols, c))) | ||||
@remote_api_endpoint('content_metadata/add') | @remote_api_endpoint('content_metadata/add') | ||||
@db_transaction() | @db_transaction() | ||||
def content_metadata_add(self, metadata, conflict_update=False, db=None, | def content_metadata_add(self, metadata, conflict_update=False, db=None, | ||||
cur=None): | cur=None): | ||||
"""Add metadata not present in storage. | """Add metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **translated_metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'id') | _check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m['id']) | metadata.sort(key=lambda m: m['id']) | ||||
db.mktemp_content_metadata(cur) | db.mktemp_content_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_content_metadata', | db.copy_to(metadata, 'tmp_content_metadata', | ||||
['id', 'translated_metadata', 'indexer_configuration_id'], | ['id', 'metadata', 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.content_metadata_add_from_temp(conflict_update, cur) | db.content_metadata_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('revision_metadata/missing') | @remote_api_endpoint('revision_intrinsic_metadata/missing') | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def revision_metadata_missing(self, metadata, db=None, cur=None): | def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): | ||||
"""List metadata missing from storage. | """List metadata missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1_git revision identifier | - **id** (bytes): sha1_git revision identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
the results | the results | ||||
Yields: | Yields: | ||||
missing ids | missing ids | ||||
""" | """ | ||||
for obj in db.revision_metadata_missing_from_list(metadata, cur): | for obj in db.revision_intrinsic_metadata_missing_from_list( | ||||
metadata, cur): | |||||
yield obj[0] | yield obj[0] | ||||
@remote_api_endpoint('revision_metadata') | @remote_api_endpoint('revision_intrinsic_metadata') | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def revision_metadata_get(self, ids, db=None, cur=None): | def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): | ||||
"""Retrieve revision metadata per id. | """Retrieve revision metadata per id. | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dictionaries with the following keys: | dictionaries with the following keys: | ||||
- **id** (bytes) | - **id** (bytes) | ||||
- **translated_metadata** (str): associated metadata | - **metadata** (str): associated metadata | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
""" | """ | ||||
for c in db.revision_metadata_get_from_list(ids, cur): | for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): | ||||
yield converters.db_to_metadata( | yield converters.db_to_metadata( | ||||
dict(zip(db.revision_metadata_cols, c))) | dict(zip(db.revision_intrinsic_metadata_cols, c))) | ||||
@remote_api_endpoint('revision_metadata/add') | @remote_api_endpoint('revision_intrinsic_metadata/add') | ||||
@db_transaction() | @db_transaction() | ||||
def revision_metadata_add(self, metadata, conflict_update=False, db=None, | def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, | ||||
cur=None): | db=None, cur=None): | ||||
"""Add metadata not present in storage. | """Add metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id**: sha1_git of revision | - **id**: sha1_git of revision | ||||
- **translated_metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'id') | _check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m['id']) | metadata.sort(key=lambda m: m['id']) | ||||
db.mktemp_revision_metadata(cur) | db.mktemp_revision_intrinsic_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_revision_metadata', | db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', | ||||
['id', 'translated_metadata', 'mappings', | ['id', 'metadata', 'mappings', | ||||
'indexer_configuration_id'], | 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.revision_metadata_add_from_temp(conflict_update, cur) | db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('revision_metadata/delete') | @remote_api_endpoint('revision_intrinsic_metadata/delete') | ||||
@db_transaction() | @db_transaction() | ||||
def revision_metadata_delete(self, entries, db=None, cur=None): | def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): | ||||
"""Remove revision metadata from the storage. | """Remove revision metadata from the storage. | ||||
Args: | Args: | ||||
entries (dict): dictionaries with the following keys: | entries (dict): dictionaries with the following keys: | ||||
- **id** (bytes): revision identifier | - **id** (bytes): revision identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
metadata | metadata | ||||
""" | """ | ||||
db.revision_metadata_delete(entries, cur) | db.revision_intrinsic_metadata_delete(entries, cur) | ||||
@remote_api_endpoint('origin_intrinsic_metadata') | @remote_api_endpoint('origin_intrinsic_metadata') | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): | def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): | ||||
"""Retrieve origin metadata per id. | """Retrieve origin metadata per id. | ||||
Args: | Args: | ||||
ids (iterable): origin identifiers | ids (iterable): origin identifiers | ||||
Yields: | Yields: | ||||
list: dictionaries with the following keys: | list: dictionaries with the following keys: | ||||
- **origin_id** (int) | - **id** (int) | ||||
- **metadata** (str): associated metadata | - **metadata** (str): associated metadata | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
""" | """ | ||||
for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): | for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): | ||||
yield converters.db_to_metadata( | yield converters.db_to_metadata( | ||||
dict(zip(db.origin_intrinsic_metadata_cols, c))) | dict(zip(db.origin_intrinsic_metadata_cols, c))) | ||||
@remote_api_endpoint('origin_intrinsic_metadata/add') | @remote_api_endpoint('origin_intrinsic_metadata/add') | ||||
@db_transaction() | @db_transaction() | ||||
def origin_intrinsic_metadata_add(self, metadata, | def origin_intrinsic_metadata_add(self, metadata, | ||||
conflict_update=False, db=None, | conflict_update=False, db=None, | ||||
cur=None): | cur=None): | ||||
"""Add origin metadata not present in storage. | """Add origin metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **origin_id**: origin identifier | - **id**: origin identifier | ||||
- **from_revision**: sha1 id of the revision used to generate | - **from_revision**: sha1 id of the revision used to generate | ||||
these metadata. | these metadata. | ||||
- **metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'origin_id') | _check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m['origin_id']) | metadata.sort(key=lambda m: m['id']) | ||||
vlorentz: You can change `_check_duplicates` to remove the second argument; it's no longer needed now. | |||||
db.mktemp_origin_intrinsic_metadata(cur) | db.mktemp_origin_intrinsic_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', | db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', | ||||
['origin_id', 'metadata', 'indexer_configuration_id', | ['id', 'metadata', 'indexer_configuration_id', | ||||
'from_revision', 'mappings'], | 'from_revision', 'mappings'], | ||||
cur) | cur) | ||||
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) | db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('origin_intrinsic_metadata/delete') | @remote_api_endpoint('origin_intrinsic_metadata/delete') | ||||
@db_transaction() | @db_transaction() | ||||
def origin_intrinsic_metadata_delete( | def origin_intrinsic_metadata_delete( | ||||
self, entries, db=None, cur=None): | self, entries, db=None, cur=None): | ||||
▲ Show 20 Lines • Show All 175 Lines • Show Last 20 Lines |
You can change `_check_duplicates` to remove the second argument; it's no longer needed now.