Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import click | import click | ||||
import logging | import logging | ||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.metadata_dictionary import compute_metadata | from swh.indexer.metadata_dictionary import compute_metadata | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
▲ Show 20 Lines • Show All 244 Lines • ▼ Show 20 Lines | def translate_revision_metadata(self, detected_files): | ||||
except Exception as e: | except Exception as e: | ||||
self.log.warn("""Exception while indexing content""", e) | self.log.warn("""Exception while indexing content""", e) | ||||
# transform translated_metadata into min set with swh-metadata-detector | # transform translated_metadata into min set with swh-metadata-detector | ||||
min_metadata = extract_minimal_metadata_dict(translated_metadata) | min_metadata = extract_minimal_metadata_dict(translated_metadata) | ||||
return min_metadata | return min_metadata | ||||
class OriginMetadataIndexer(OriginIndexer):
    """Attach already-computed revision metadata to origins.

    For every origin, the metadata of the revision paired with it in
    ``origin_head_pairs`` is looked up among ``revisions_metadata`` and
    stored through ``origin_intrinsic_metadata_add``.
    """

    def filter(self, ids):
        """Return ``ids`` unchanged; no filtering is applied."""
        return ids

    def run(self, revisions_metadata, policy_update, *, origin_head_pairs):
        """Index the origins listed in ``origin_head_pairs``.

        Unlike other indexers, the first argument is not a list of ids.
        It is expected to be the result of RevisionMetadataIndexer,
        which arrives as the first positional argument because it is
        forwarded from a previous Celery task.

        Args:
            revisions_metadata: the metadata entries produced by
                RevisionMetadataIndexer; each entry is looked up by its
                ``'id'`` key (a revision id) in :meth:`index`.
            policy_update: update policy; the value ``'update-dups'``
                makes :meth:`persist_index_computations` overwrite
                conflicting entries.
            origin_head_pairs: iterable of dicts, each with an
                ``'origin_id'`` key and a ``'revision_id'`` key, telling
                which revision's metadata belongs to which origin.

        Returns:
            Whatever the parent class's ``run`` returns.
        """
        origin_head_map = dict(
            (pair['origin_id'], pair['revision_id'])
            for pair in origin_head_pairs
        )
        # The ids handed to the parent class are origin ids (the keys of
        # the map), not revision ids.
        origin_ids = list(origin_head_map.keys())
        return super().run(
            ids=origin_ids,
            policy_update=policy_update,
            revisions_metadata=revisions_metadata,
            origin_head_map=origin_head_map,
        )

    def index(self, origin, *, revisions_metadata, origin_head_map):
        """Return the metadata entry of the revision mapped to ``origin``.

        Args:
            origin: dict with an ``'id'`` key (the origin id).
            revisions_metadata: entries to search, each with an ``'id'``
                key holding a revision id.
            origin_head_map: dict from origin id to revision id.

        Returns:
            The first entry of ``revisions_metadata`` whose ``'id'``
            equals the revision id mapped to the origin.

        Raises:
            KeyError: if the origin is absent from ``origin_head_map``
                or no entry matches its revision id.
        """
        wanted_id = origin_head_map[origin['id']]
        candidates = (entry for entry in revisions_metadata
                      if entry['id'] == wanted_id)
        match = next(candidates, None)
        if match is not None:
            return match

        # If you get this KeyError with a message like this:
        # 'foo' not in [b'foo']
        # you should check you're not using JSON as task serializer
        known_ids = [r['id'] for r in revisions_metadata]
        raise KeyError('%r not in %r' % (wanted_id, known_ids))

    def persist_index_computations(self, results, policy_update):
        """Store ``results`` in the indexer storage.

        Args:
            results: the entries to store.
            policy_update: conflicting entries are updated only when
                this equals ``'update-dups'``.
        """
        overwrite = policy_update == 'update-dups'
        self.idx_storage.origin_intrinsic_metadata_add(
            results, conflict_update=overwrite)
@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    """Run RevisionMetadataIndexer on the revisions given with --revs.

    Args:
        revs: revision identifiers as passed on the command line; each
            is converted with ``hashutil.hash_to_bytes`` before being
            handed to the indexer.
    """
    sha1_gits = [hashutil.hash_to_bytes(rev) for rev in revs]
    indexer = RevisionMetadataIndexer()
    indexer.run(sha1_gits, 'update-dups')
if __name__ == '__main__':
    # Script entry point: configure INFO-level logging first so the
    # indexer's log output is visible, then hand over to the click command.
    logging.basicConfig(level=logging.INFO)
    main()
Can you add explicit docstrings about the arguments: