diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -304,19 +304,20 @@ revs_to_delete = [] origs_to_delete = [] for (orig_item, rev_item) in results: - assert rev_item['mappings'] == orig_item['mappings'] - if rev_item['mappings']: - # Only store translated metadata if we found a metadata file. - # Otherwise it's just an empty dict with a "@context" key. - if rev_item not in rev_metadata: - rev_metadata.append(rev_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) - else: + assert rev_item['metadata'] == orig_item['metadata'] + if not rev_item['metadata'] or \ + rev_item['metadata'].keys() <= {'@context'}: + # If we didn't find any metadata, don't store a DB record + # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) + else: + if rev_item not in rev_metadata: + rev_metadata.append(rev_item) + if orig_item not in orig_metadata: + orig_metadata.append(orig_item) if rev_metadata: self.idx_storage.revision_intrinsic_metadata_add( @@ -326,7 +327,7 @@ orig_metadata, conflict_update=conflict_update) # revs_to_delete should always be empty unless we changed a mapping - # to detect less files. + # to detect fewer files or less content. # However, origs_to_delete may be empty whenever an upstream deletes # a metadata file. 
if origs_to_delete: diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -163,7 +163,7 @@ assert len(results) == 2 -def test_origin_metadata_indexer_no_metadata( +def test_origin_metadata_indexer_no_metadata_file( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) @@ -185,6 +185,52 @@ assert results == [] +def test_origin_metadata_indexer_no_metadata( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=(['npm'], {'@context': 'foo'})): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert results == [] + + +def test_origin_metadata_indexer_error( + idx_storage, storage, obj_storage): + + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + with patch('swh.indexer.metadata.RevisionMetadataIndexer' + '.translate_revision_intrinsic_metadata', + return_value=None): + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert results == [] + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + assert 
results == [] + + def test_origin_metadata_indexer_delete_metadata( idx_storage, storage, obj_storage):