Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import click | import click | ||||
import logging | import logging | ||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.metadata_dictionary import compute_metadata | from swh.indexer.metadata_dictionary import compute_metadata | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
▲ Show 20 Lines • Show All 244 Lines • ▼ Show 20 Lines | def translate_revision_metadata(self, detected_files): | ||||
except Exception as e: | except Exception as e: | ||||
self.log.warn("""Exception while indexing content""", e) | self.log.warn("""Exception while indexing content""", e) | ||||
# transform translated_metadata into min set with swh-metadata-detector | # transform translated_metadata into min set with swh-metadata-detector | ||||
min_metadata = extract_minimal_metadata_dict(translated_metadata) | min_metadata = extract_minimal_metadata_dict(translated_metadata) | ||||
return min_metadata | return min_metadata | ||||
class OriginMetadataIndexer(OriginIndexer): | |||||
def filter(self, ids):
    """Return the ids to index.

    No filtering is applied here: the ids received are returned unchanged.

    Args:
        ids: the ids handed to the indexer.

    Returns:
        The very same `ids` object.

    """
    return ids
def run(self, revisions_metadata, policy_update, *, origin_head_pairs):
    """Index origins from already-computed revision metadata.

    Expected to be called with the result of RevisionMetadataIndexer
    as first argument; ie. not a list of ids as other indexers would.

    Args:
        revisions_metadata (List[dict]): contains metadata from
            revisions, along with the respective revision ids. It is
            passed by RevisionMetadataIndexer via a Celery chain
            triggered by OriginIndexer.next_step.
        policy_update (str): `'ignore-dups'` or `'update-dups'`.
        origin_head_pairs (List[dict]): list of dictionaries with
            keys `origin_id` and `revision_id`, which is the result
            of OriginHeadIndexer. Keyword-only.

    Returns:
        Whatever the parent class's `run()` returns.

    """
    # Map each origin id to the id of its head revision.
    origin_head_map = {pair['origin_id']: pair['revision_id']
                       for pair in origin_head_pairs}

    # Fix up the argument order. revisions_metadata has to be the
    # first argument because of celery.chain; the next line calls
    # run() with the usual order, ie. origin ids first.
    return super().run(ids=list(origin_head_map),
                       policy_update=policy_update,
                       revisions_metadata=revisions_metadata,
                       origin_head_map=origin_head_map)
moranegg: I'm kinda curious about the argument annotation `*`, could you point me to documentation… | |||||
Done Inline Actions: It prevents the arguments that follow it from being passed positionally, i.e. index('foo', 'bar', 'baz') is invalid; you have to write index('foo', revisions_metadata='bar', origin_head_map='baz'). This is intended as a way to prevent mistakes that would cause hard-to-find bugs in future versions with a different API. vlorentz: It prevents the next arguments from being called as positional arguments, ie. `index('foo'… | |||||
def index(self, origin, *, revisions_metadata, origin_head_map):
    """Build the intrinsic metadata entry of one origin.

    The bare `*` in the signature makes `revisions_metadata` and
    `origin_head_map` keyword-only, so a positional call such as
    `index('foo', 'bar', 'baz')` is rejected.

    Args:
        origin (dict): the origin to index; only its `id` key is read.
        revisions_metadata (List[dict]): dictionaries with keys `id`,
            `translated_metadata` and `indexer_configuration_id`.
        origin_head_map (dict): maps an origin id to the id of its
            head revision.

    Returns:
        dict: with keys `origin_id`, `metadata`, `from_revision` and
        `indexer_configuration_id`, taken from the first entry of
        `revisions_metadata` whose `id` is the origin's head revision.

    Raises:
        KeyError: if the origin id is missing from `origin_head_map`,
            or if no entry of `revisions_metadata` matches the head
            revision.

    """
    origin_id = origin['id']

    # Get the last revision of the origin.
    revision_id = origin_head_map[origin_id]

    # Get the metadata of that revision, and return it
    for revision_metadata in revisions_metadata:
        if revision_metadata['id'] == revision_id:
            return {
                'origin_id': origin_id,
                'metadata': revision_metadata['translated_metadata'],
                'from_revision': revision_id,
                'indexer_configuration_id':
                revision_metadata['indexer_configuration_id'],
            }

    # If you get this KeyError with a message like this:
    # 'foo' not in [b'foo']
    # you should check you're not using JSON as task serializer
    raise KeyError('%r not in %r' %
                   (revision_id, [r['id'] for r in revisions_metadata]))
def persist_index_computations(self, results, policy_update):
    """Hand the computed results over to the indexer storage.

    Args:
        results: the entries to store; passed through unchanged to
            `self.idx_storage.origin_intrinsic_metadata_add`.
        policy_update (str): conflicting entries are updated only when
            this is exactly `'update-dups'`.

    """
    self.idx_storage.origin_intrinsic_metadata_add(
        results, conflict_update=(policy_update == 'update-dups'))
@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    # Command-line entry point: index the metadata of the revisions given
    # with --revs/-i (repeatable).
    # Documented with comments rather than a docstring on purpose: click
    # would display a docstring as the command's --help text.

    # Convert each identifier given on the command line with
    # hashutil.hash_to_bytes before handing it to the indexer.
    _git_sha1s = list(map(hashutil.hash_to_bytes, revs))
    rev_metadata_indexer = RevisionMetadataIndexer()
    rev_metadata_indexer.run(_git_sha1s, 'update-dups')
# Script entry point: set logging to INFO level, then run the click command.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
I'm kinda curious about the `*` in the argument list: could you point me to documentation explaining its usage,
or tell me what its advantage is here?