Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/__init__.py
Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | elif cls == 'local': | ||||
from . import IndexerStorage | from . import IndexerStorage | ||||
elif cls == 'memory': | elif cls == 'memory': | ||||
from .in_memory import IndexerStorage | from .in_memory import IndexerStorage | ||||
else: | else: | ||||
raise ValueError('Unknown indexer storage class `%s`' % cls) | raise ValueError('Unknown indexer storage class `%s`' % cls) | ||||
return IndexerStorage(**args) | return IndexerStorage(**args) | ||||
def _check_duplicates(data, key): | |||||
""" | |||||
If any two dictionaries in `data` have the same value for the | |||||
key, raises a `ValueError`. | |||||
Values associated to the key must be hashable. | |||||
Args: | |||||
data (List[dict]): List of dictionaries to be inserted | |||||
key (str): Name of the key that acts as id. | |||||
>>> _check_duplicates([ | |||||
... {'id': 'foo', 'data': 'spam'}, | |||||
... {'id': 'bar', 'data': 'egg'}, | |||||
... ], 'id') | |||||
>>> _check_duplicates([ | |||||
... {'id': 'foo', 'data': 'spam'}, | |||||
... {'id': 'foo', 'data': 'egg'}, | |||||
... ], 'id') | |||||
Traceback (most recent call last): | |||||
... | |||||
ValueError: The same id is present more than once. | |||||
""" | |||||
if len({item[key] for item in data}) < len(data): | |||||
raise ValueError( | |||||
'The same {} is present more than once.'.format(key)) | |||||
douardda: I'm still not quite happy with this for several reasons:
- the arg name 'column' makes no sense… | |||||
Done Inline Actions
Indeed, fixed.
It's in the module that implements the pg backend and its name is prefixed with _, so it's unlikely to be used anywhere else. Fixed anyway.
There's the doctest
Indeed, fixed. vlorentz: > the arg name 'column' makes no sense in the context of checking stuff on dictionaries… | |||||
class IndexerStorage: | class IndexerStorage: | ||||
"""SWH Indexer Storage | """SWH Indexer Storage | ||||
""" | """ | ||||
def __init__(self, db, min_pool_conns=1, max_pool_conns=10): | def __init__(self, db, min_pool_conns=1, max_pool_conns=10): | ||||
""" | """ | ||||
Args: | Args: | ||||
db_conn: either a libpq connection string, or a psycopg2 connection | db_conn: either a libpq connection string, or a psycopg2 connection | ||||
▲ Show 20 Lines • Show All 151 Lines • ▼ Show 20 Lines | def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, | ||||
- **encoding** (bytes): raw content's encoding | - **encoding** (bytes): raw content's encoding | ||||
- **indexer_configuration_id** (int): tool's id used to | - **indexer_configuration_id** (int): tool's id used to | ||||
compute the results | compute the results | ||||
- **conflict_update** (bool): Flag to determine if we want to | - **conflict_update** (bool): Flag to determine if we want to | ||||
overwrite (``True``) or skip duplicates (``False``, the | overwrite (``True``) or skip duplicates (``False``, the | ||||
default) | default) | ||||
""" | """ | ||||
_check_duplicates(mimetypes, 'id') | |||||
db.mktemp_content_mimetype(cur) | db.mktemp_content_mimetype(cur) | ||||
db.copy_to(mimetypes, 'tmp_content_mimetype', | db.copy_to(mimetypes, 'tmp_content_mimetype', | ||||
['id', 'mimetype', 'encoding', 'indexer_configuration_id'], | ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.content_mimetype_add_from_temp(conflict_update, cur) | db.content_mimetype_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('content_mimetype') | @remote_api_endpoint('content_mimetype') | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | def content_language_add(self, languages, conflict_update=False, db=None, | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
- **lang** (bytes): language detected | - **lang** (bytes): language detected | ||||
conflict_update (bool): Flag to determine if we want to | conflict_update (bool): Flag to determine if we want to | ||||
overwrite (true) or skip duplicates (false, the | overwrite (true) or skip duplicates (false, the | ||||
default) | default) | ||||
""" | """ | ||||
_check_duplicates(languages, 'id') | |||||
db.mktemp_content_language(cur) | db.mktemp_content_language(cur) | ||||
# empty language is mapped to 'unknown' | # empty language is mapped to 'unknown' | ||||
db.copy_to( | db.copy_to( | ||||
({ | ({ | ||||
'id': l['id'], | 'id': l['id'], | ||||
'lang': 'unknown' if not l['lang'] else l['lang'], | 'lang': 'unknown' if not l['lang'] else l['lang'], | ||||
'indexer_configuration_id': l['indexer_configuration_id'], | 'indexer_configuration_id': l['indexer_configuration_id'], | ||||
} for l in languages), | } for l in languages), | ||||
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines | def content_ctags_add(self, ctags, conflict_update=False, db=None, | ||||
Args: | Args: | ||||
ctags (iterable): dictionaries with keys: | ctags (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
- **ctags** ([list): List of dictionary with keys: name, kind, | - **ctags** ([list): List of dictionary with keys: name, kind, | ||||
line, lang | line, lang | ||||
""" | """ | ||||
_check_duplicates(ctags, 'id') | |||||
def _convert_ctags(__ctags): | def _convert_ctags(__ctags): | ||||
"""Convert ctags dict to list of ctags. | """Convert ctags dict to list of ctags. | ||||
""" | """ | ||||
for ctags in __ctags: | for ctags in __ctags: | ||||
yield from converters.ctags_to_db(ctags) | yield from converters.ctags_to_db(ctags) | ||||
db.mktemp_content_ctags(cur) | db.mktemp_content_ctags(cur) | ||||
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines | def content_fossology_license_add(self, licenses, conflict_update=False, | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
Returns: | Returns: | ||||
list: content_license entries which failed due to unknown licenses | list: content_license entries which failed due to unknown licenses | ||||
""" | """ | ||||
# Then, we add the correct ones | _check_duplicates(licenses, 'id') | ||||
db.mktemp_content_fossology_license(cur) | db.mktemp_content_fossology_license(cur) | ||||
db.copy_to( | db.copy_to( | ||||
({ | ({ | ||||
'id': sha1['id'], | 'id': sha1['id'], | ||||
'indexer_configuration_id': sha1['indexer_configuration_id'], | 'indexer_configuration_id': sha1['indexer_configuration_id'], | ||||
'license': license, | 'license': license, | ||||
} for sha1 in licenses | } for sha1 in licenses | ||||
for license in sha1['licenses']), | for license in sha1['licenses']), | ||||
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines | def content_metadata_add(self, metadata, conflict_update=False, db=None, | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **translated_metadata**: arbitrary dict | - **translated_metadata**: arbitrary dict | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'id') | |||||
db.mktemp_content_metadata(cur) | db.mktemp_content_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_content_metadata', | db.copy_to(metadata, 'tmp_content_metadata', | ||||
['id', 'translated_metadata', 'indexer_configuration_id'], | ['id', 'translated_metadata', 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.content_metadata_add_from_temp(conflict_update, cur) | db.content_metadata_add_from_temp(conflict_update, cur) | ||||
@remote_api_endpoint('revision_metadata/missing') | @remote_api_endpoint('revision_metadata/missing') | ||||
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines | def revision_metadata_add(self, metadata, conflict_update=False, db=None, | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'id') | |||||
db.mktemp_revision_metadata(cur) | db.mktemp_revision_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_revision_metadata', | db.copy_to(metadata, 'tmp_revision_metadata', | ||||
['id', 'translated_metadata', 'mappings', | ['id', 'translated_metadata', 'mappings', | ||||
'indexer_configuration_id'], | 'indexer_configuration_id'], | ||||
cur) | cur) | ||||
db.revision_metadata_add_from_temp(conflict_update, cur) | db.revision_metadata_add_from_temp(conflict_update, cur) | ||||
Show All 36 Lines | def origin_intrinsic_metadata_add(self, metadata, | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
_check_duplicates(metadata, 'origin_id') | |||||
db.mktemp_origin_intrinsic_metadata(cur) | db.mktemp_origin_intrinsic_metadata(cur) | ||||
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', | db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', | ||||
['origin_id', 'metadata', 'indexer_configuration_id', | ['origin_id', 'metadata', 'indexer_configuration_id', | ||||
'from_revision', 'mappings'], | 'from_revision', 'mappings'], | ||||
cur) | cur) | ||||
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) | db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) | ||||
▲ Show 20 Lines • Show All 127 Lines • Show Last 20 Lines |
I'm still not quite happy with this for several reasons: