Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/in_memory.py
Show First 20 Lines • Show All 190 Lines • ▼ Show 20 Lines | class IndexerStorage: | ||||
def __init__(self): | def __init__(self): | ||||
self._tools = {} | self._tools = {} | ||||
self._mimetypes = SubStorage(self._tools) | self._mimetypes = SubStorage(self._tools) | ||||
self._languages = SubStorage(self._tools) | self._languages = SubStorage(self._tools) | ||||
self._content_ctags = SubStorage(self._tools) | self._content_ctags = SubStorage(self._tools) | ||||
self._licenses = SubStorage(self._tools) | self._licenses = SubStorage(self._tools) | ||||
self._content_metadata = SubStorage(self._tools) | self._content_metadata = SubStorage(self._tools) | ||||
self._revision_metadata = SubStorage(self._tools) | self._revision_intrinsic_metadata = SubStorage(self._tools) | ||||
self._origin_intrinsic_metadata = SubStorage(self._tools) | self._origin_intrinsic_metadata = SubStorage(self._tools) | ||||
def content_mimetype_missing(self, mimetypes): | def content_mimetype_missing(self, mimetypes): | ||||
"""Generate mimetypes missing from storage. | """Generate mimetypes missing from storage. | ||||
Args: | Args: | ||||
mimetypes (iterable): iterable of dict with keys: | mimetypes (iterable): iterable of dict with keys: | ||||
▲ Show 20 Lines • Show All 300 Lines • ▼ Show 20 Lines | def content_metadata_get(self, ids): | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dictionaries with the following keys: | dictionaries with the following keys: | ||||
- **id** (bytes) | - **id** (bytes) | ||||
- **translated_metadata** (str): associated metadata | - **metadata** (str): associated metadata | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
""" | """ | ||||
yield from self._content_metadata.get(ids) | yield from self._content_metadata.get(ids) | ||||
def content_metadata_add(self, metadata, conflict_update=False): | def content_metadata_add(self, metadata, conflict_update=False): | ||||
"""Add metadata not present in storage. | """Add metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **translated_metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
results | results | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
if not all(isinstance(x['id'], bytes) for x in metadata): | if not all(isinstance(x['id'], bytes) for x in metadata): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._content_metadata.add(metadata, conflict_update) | self._content_metadata.add(metadata, conflict_update) | ||||
def revision_metadata_missing(self, metadata): | def revision_intrinsic_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | """List metadata missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1_git revision identifier | - **id** (bytes): sha1_git revision identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
the results | the results | ||||
Yields: | Yields: | ||||
missing ids | missing ids | ||||
""" | """ | ||||
yield from self._revision_metadata.missing(metadata) | yield from self._revision_intrinsic_metadata.missing(metadata) | ||||
def revision_metadata_get(self, ids): | def revision_intrinsic_metadata_get(self, ids): | ||||
"""Retrieve revision metadata per id. | """Retrieve revision metadata per id. | ||||
Args: | Args: | ||||
ids (iterable): sha1 checksums | ids (iterable): sha1 checksums | ||||
Yields: | Yields: | ||||
dictionaries with the following keys: | dictionaries with the following keys: | ||||
- **id** (bytes) | - **id** (bytes) | ||||
- **translated_metadata** (str): associated metadata | - **metadata** (str): associated metadata | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
""" | """ | ||||
yield from self._revision_metadata.get(ids) | yield from self._revision_intrinsic_metadata.get(ids) | ||||
def revision_metadata_add(self, metadata, conflict_update=False): | def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): | ||||
"""Add metadata not present in storage. | """Add metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id**: sha1_git of revision | - **id**: sha1_git of revision | ||||
- **translated_metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
if not all(isinstance(x['id'], bytes) for x in metadata): | if not all(isinstance(x['id'], bytes) for x in metadata): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._revision_metadata.add(metadata, conflict_update) | self._revision_intrinsic_metadata.add(metadata, conflict_update) | ||||
def revision_metadata_delete(self, entries): | def revision_intrinsic_metadata_delete(self, entries): | ||||
"""Remove revision metadata from the storage. | """Remove revision metadata from the storage. | ||||
Args: | Args: | ||||
entries (dict): dictionaries with the following keys: | entries (dict): dictionaries with the following keys: | ||||
- **revision** (int): origin identifier | - **revision** (int): origin identifier | ||||
- **id** (int): tool used to compute metadata | - **id** (int): tool used to compute metadata | ||||
""" | """ | ||||
self._revision_metadata.delete(entries) | self._revision_intrinsic_metadata.delete(entries) | ||||
def origin_intrinsic_metadata_get(self, ids): | def origin_intrinsic_metadata_get(self, ids): | ||||
"""Retrieve origin metadata per id. | """Retrieve origin metadata per id. | ||||
Args: | Args: | ||||
ids (iterable): origin identifiers | ids (iterable): origin identifiers | ||||
Yields: | Yields: | ||||
list: dictionaries with the following keys: | list: dictionaries with the following keys: | ||||
- **origin_id** (int) | - **id** (int) | ||||
- **translated_metadata** (str): associated metadata | - **metadata** (str): associated metadata | ||||
- **tool** (dict): tool used to compute metadata | - **tool** (dict): tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
""" | """ | ||||
for item in self._origin_intrinsic_metadata.get(ids): | yield from self._origin_intrinsic_metadata.get(ids) | ||||
vlorentz: That line (and other similar ones) can be dropped now | |||||
Done Inline Actions vlorentz: Sorry, one more nitpick: the for loop can be replaced by: `yield from self._origin_intrinsic_metadata.get(ids)` | |||||
item['origin_id'] = item.pop('id') | |||||
yield item | |||||
def origin_intrinsic_metadata_add(self, metadata, | def origin_intrinsic_metadata_add(self, metadata, | ||||
conflict_update=False): | conflict_update=False): | ||||
"""Add origin metadata not present in storage. | """Add origin metadata not present in storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **origin_id**: origin identifier | - **id**: origin identifier | ||||
- **from_revision**: sha1 id of the revision used to generate | - **from_revision**: sha1 id of the revision used to generate | ||||
these metadata. | these metadata. | ||||
- **metadata**: arbitrary dict | - **metadata**: arbitrary dict | ||||
- **indexer_configuration_id**: tool used to compute metadata | - **indexer_configuration_id**: tool used to compute metadata | ||||
- **mappings** (List[str]): list of mappings used to translate | - **mappings** (List[str]): list of mappings used to translate | ||||
these metadata | these metadata | ||||
conflict_update: Flag to determine if we want to overwrite (true) | conflict_update: Flag to determine if we want to overwrite (true) | ||||
or skip duplicates (false, the default) | or skip duplicates (false, the default) | ||||
""" | """ | ||||
self._origin_intrinsic_metadata.add(metadata, conflict_update) | |||||
items = [] | |||||
for item in metadata: | |||||
item = item.copy() | |||||
item['id'] = item.pop('origin_id') | |||||
items.append(item) | |||||
self._origin_intrinsic_metadata.add(items, conflict_update) | |||||
def origin_intrinsic_metadata_delete(self, entries): | def origin_intrinsic_metadata_delete(self, entries): | ||||
"""Remove origin metadata from the storage. | """Remove origin metadata from the storage. | ||||
Args: | Args: | ||||
entries (dict): dictionaries with the following keys: | entries (dict): dictionaries with the following keys: | ||||
- **origin_id** (int): origin identifier | - **id** (int): origin identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
metadata | metadata | ||||
""" | """ | ||||
items = [] | self._origin_intrinsic_metadata.delete(entries) | ||||
for entry in entries: | |||||
item = entry.copy() | |||||
item['id'] = item.pop('origin_id') | |||||
items.append(item) | |||||
self._origin_intrinsic_metadata.delete(items) | |||||
def origin_intrinsic_metadata_search_fulltext( | def origin_intrinsic_metadata_search_fulltext( | ||||
self, conjunction, limit=100): | self, conjunction, limit=100): | ||||
"""Returns the list of origins whose metadata contain all the terms. | """Returns the list of origins whose metadata contain all the terms. | ||||
Args: | Args: | ||||
conjunction (List[str]): List of terms to be searched for. | conjunction (List[str]): List of terms to be searched for. | ||||
limit (int): The maximum number of results to return | limit (int): The maximum number of results to return | ||||
Show All 32 Lines | def origin_intrinsic_metadata_search_fulltext( | ||||
return score / math.log(len(text_tokens)) | return score / math.log(len(text_tokens)) | ||||
results = [(rank(data), data) | results = [(rank(data), data) | ||||
for data in self._origin_intrinsic_metadata.get_all()] | for data in self._origin_intrinsic_metadata.get_all()] | ||||
results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | ||||
results.sort(key=operator.itemgetter(0), # Don't try to order 'data' | results.sort(key=operator.itemgetter(0), # Don't try to order 'data' | ||||
reverse=True) | reverse=True) | ||||
for (rank_, result) in results[:limit]: | for (rank_, result) in results[:limit]: | ||||
result = result.copy() | |||||
result['origin_id'] = result.pop('id') | |||||
yield result | yield result | ||||
def origin_intrinsic_metadata_search_by_producer( | def origin_intrinsic_metadata_search_by_producer( | ||||
self, start=0, end=None, limit=100, ids_only=False, | self, start=0, end=None, limit=100, ids_only=False, | ||||
mappings=None, tool_ids=None, | mappings=None, tool_ids=None, | ||||
db=None, cur=None): | db=None, cur=None): | ||||
"""Returns the list of origins whose metadata contain all the terms. | """Returns the list of origins whose metadata contain all the terms. | ||||
Show All 29 Lines | def origin_intrinsic_metadata_search_by_producer( | ||||
return | return | ||||
if mappings is not None and mappings.isdisjoint(entry['mappings']): | if mappings is not None and mappings.isdisjoint(entry['mappings']): | ||||
continue | continue | ||||
if tool_ids is not None and entry['tool']['id'] not in tool_ids: | if tool_ids is not None and entry['tool']['id'] not in tool_ids: | ||||
continue | continue | ||||
if ids_only: | if ids_only: | ||||
yield entry['id'] | yield entry['id'] | ||||
else: | else: | ||||
entry = entry.copy() | |||||
entry['origin_id'] = entry.pop('id') | |||||
yield entry | yield entry | ||||
nb_results += 1 | nb_results += 1 | ||||
def origin_intrinsic_metadata_stats(self): | def origin_intrinsic_metadata_stats(self): | ||||
"""Returns statistics on stored intrinsic metadata. | """Returns statistics on stored intrinsic metadata. | ||||
Returns: | Returns: | ||||
dict: dictionary with keys: | dict: dictionary with keys: | ||||
▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines |
That line (and other similar ones) can be dropped now