Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import click | import click | ||||
import itertools | import itertools | ||||
import logging | import logging | ||||
from copy import deepcopy | from copy import deepcopy | ||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.origin_head import OriginHeadIndexer | |||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
▲ Show 20 Lines • Show All 112 Lines • ▼ Show 20 Lines | def index(self, rev): | ||||
"""Index rev by processing it and organizing result. | """Index rev by processing it and organizing result. | ||||
use metadata_detector to iterate on filenames | use metadata_detector to iterate on filenames | ||||
- if one filename detected -> sends file to content indexer | - if one filename detected -> sends file to content indexer | ||||
- if multiple file detected -> translation needed at revision level | - if multiple file detected -> translation needed at revision level | ||||
Args: | Args: | ||||
rev (bytes): revision artifact from storage | rev (dict): revision artifact from storage | ||||
Returns: | Returns: | ||||
dict: dictionary representing a revision_metadata, with keys: | dict: dictionary representing a revision_metadata, with keys: | ||||
- id (str): rev's identifier (sha1_git) | - id (str): rev's identifier (sha1_git) | ||||
- indexer_configuration_id (bytes): tool used | - indexer_configuration_id (bytes): tool used | ||||
- translated_metadata: dict of retrieved metadata | - translated_metadata: dict of retrieved metadata | ||||
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines | def index(self, origin, *, origin_head_map): | ||||
return results | return results | ||||
def persist_index_computations(self, results, policy_update):
    """Write the computed origin metadata to the indexer storage.

    Args:
        results: iterable of iterables of items; the inner iterables are
            flattened into one list before being stored.
        policy_update (str): ``conflict_update`` is passed as True only
            when this equals 'update-dups'.

    """
    flattened = list(itertools.chain.from_iterable(results))
    overwrite = policy_update == 'update-dups'
    self.idx_storage.origin_intrinsic_metadata_add(
        flattened, conflict_update=overwrite)
class FullOriginMetadataIndexer(OriginIndexer):
    """Index the intrinsic metadata of an origin from its head revision.

    For each origin, the head revision is looked up with an
    ``OriginHeadIndexer``, fetched from storage and handed to a
    ``RevisionMetadataIndexer``.  Both the revision-level result and the
    origin-level result derived from it are then written to the indexer
    storage.
    """
    # Resolved ("Done") inline review threads captured with this class in
    # the diff view, kept here for context only:
    #   - douardda: why not overload the prepare() method?
    #     vlorentz: so tests can override just this part without
    #     reimplementing the whole prepare(); it should eventually go away.
    #     douardda: I do not see this method being overloaded in the tests.
    #   - douardda: kill this method and add a USE_TOOLS = False class
    #     attribute, see D990.
    #   - douardda: should not be required any more, see D990.

    CONFIG_BASE_FILENAME = 'indexer/full_origin_intrinsic_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    USE_TOOLS = False

    def __init__(self):
        super().__init__()
        # Sub-indexers this indexer delegates to.
        self.origin_head_indexer = OriginHeadIndexer()
        self.revision_metadata_indexer = RevisionMetadataIndexer()

    def index(self, origin):
        """Compute the metadata of one origin from its head revision.

        Args:
            origin (dict): origin to index; its 'id' key is read.

        Returns:
            tuple: ``(orig_metadata, rev_metadata)`` where ``rev_metadata``
            is the result of the revision metadata indexer and
            ``orig_metadata`` is a dict with keys 'from_revision',
            'origin_id', 'metadata' and 'indexer_configuration_id'.
            None when the origin has no head result or when the head
            revision is not returned by the storage.

        """
        head_result = self.origin_head_indexer.index(origin)
        if not head_result:
            return

        rev_id = head_result['revision_id']
        revs = list(self.storage.revision_get([rev_id]))
        if not revs:
            # Pass the format arguments separately (not wrapped in a single
            # tuple) so that both placeholders get a value, and render the
            # id as hex so the message is readable.
            # NOTE(review): `self.warning` is not defined in the visible
            # part of this file -- confirm the base class provides it
            # (as opposed to e.g. a logger attribute).
            self.warning('Missing head revision %s of origin %r',
                         hashutil.hash_to_hex(rev_id), origin)
            return

        # A single id was requested, so a single revision is expected.
        assert len(revs) == 1
        rev = revs[0]

        rev_metadata = self.revision_metadata_indexer.index(rev)
        orig_metadata = {
            'from_revision': rev_metadata['id'],
            'origin_id': origin['id'],
            'metadata': rev_metadata['translated_metadata'],
            'indexer_configuration_id':
                rev_metadata['indexer_configuration_id'],
        }
        return (orig_metadata, rev_metadata)

    def persist_index_computations(self, results, policy_update):
        """Store revision-level, then origin-level, metadata.

        Args:
            results: iterable of ``(orig_metadata, rev_metadata)`` pairs as
                returned by :meth:`index`; falsy entries are ignored.
            policy_update (str): ``conflict_update`` is passed as True only
                when this equals 'update-dups'.

        """
        # Materialize once: `results` is traversed for both storage calls,
        # and `index` may have returned None for some origins.
        pairs = [pair for pair in results if pair]
        overwrite = policy_update == 'update-dups'

        self.idx_storage.revision_metadata_add(
            [rev_item for (_, rev_item) in pairs],
            conflict_update=overwrite)
        self.idx_storage.origin_intrinsic_metadata_add(
            [orig_item for (orig_item, _) in pairs],
            conflict_update=overwrite)
@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    # Command-line entry point: convert every --revs value to bytes and run
    # the revision metadata indexer on them with the 'update-dups' policy.
    # Documented with comments rather than a docstring because click would
    # expose a docstring as the command's --help text.
    revision_ids = [hashutil.hash_to_bytes(hex_id) for hex_id in revs]
    indexer = RevisionMetadataIndexer()
    indexer.run(revision_ids, 'update-dups')
if __name__ == '__main__':
    # Script mode only: configure root logging at INFO level before
    # handing control to the click command.
    logging.basicConfig(level=logging.INFO)
    main()
why not overload the prepare() method?