diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py
--- a/swh/storage/backfill.py
+++ b/swh/storage/backfill.py
@@ -20,7 +20,11 @@
 from swh.core.db import BaseDb
 from swh.journal.writer.kafka import KafkaJournalWriter
-from swh.storage.converters import db_to_release, db_to_revision
+from swh.storage.converters import (
+    db_to_raw_extrinsic_metadata,
+    db_to_release,
+    db_to_revision,
+)
 from swh.storage.replay import object_converter_fn

@@ -30,6 +34,9 @@
     "content": "sha1",
     "skipped_content": "sha1",
     "directory": "id",
+    "metadata_authority": "type, url",
+    "metadata_fetcher": "name, version",
+    "raw_extrinsic_metadata": "id",
     "revision": "revision.id",
     "release": "release.id",
     "snapshot": "id",
@@ -59,6 +66,26 @@
         "reason",
     ],
     "directory": ["id", "dir_entries", "file_entries", "rev_entries"],
+    "metadata_authority": ["type", "url", "metadata",],
+    "metadata_fetcher": ["name", "version", "metadata",],
+    "raw_extrinsic_metadata": [
+        "raw_extrinsic_metadata.type",
+        "raw_extrinsic_metadata.id",
+        "metadata_authority.type",
+        "metadata_authority.url",
+        "metadata_fetcher.name",
+        "metadata_fetcher.version",
+        "discovery_date",
+        "format",
+        "raw_extrinsic_metadata.metadata",
+        "origin",
+        "visit",
+        "snapshot",
+        "release",
+        "revision",
+        "path",
+        "directory",
+    ],
     "revision": [
         ("revision.id", "id"),
         "date",
@@ -123,6 +150,11 @@
     ],
     "origin_visit": ["origin on origin_visit.origin=origin.id"],
     "origin_visit_status": ["origin on origin_visit_status.origin=origin.id"],
+    "raw_extrinsic_metadata": [
+        "metadata_authority on "
+        "raw_extrinsic_metadata.authority_id=metadata_authority.id",
+        "metadata_fetcher on raw_extrinsic_metadata.fetcher_id=metadata_fetcher.id",
+    ],
 }


@@ -160,6 +192,14 @@
     return directory


+def raw_extrinsic_metadata_converter(db, metadata):
+    """Convert raw extrinsic metadata from the flat representation to swh model
+    compatible objects.
+
+    """
+    return db_to_raw_extrinsic_metadata(metadata).to_dict()
+
+
 def revision_converter(db, revision):
     """Convert revision from the flat representation to swh model
     compatible objects.
@@ -205,6 +245,7 @@
 CONVERTERS = {
     "directory": directory_converter,
+    "raw_extrinsic_metadata": raw_extrinsic_metadata_converter,
     "revision": revision_converter,
     "release": release_converter,
     "snapshot": snapshot_converter,
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -9,8 +9,18 @@
 from swh.core.utils import encode_with_unescape
 from swh.model import identifiers
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    RawExtrinsicMetadata,
+)
 from swh.model.hashutil import MultiHash

+from .utils import map_optional
+

 DEFAULT_AUTHOR = {
     "fullname": None,
@@ -288,6 +298,34 @@
     return ret


+def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
+    type_ = MetadataTargetType(row["raw_extrinsic_metadata.type"])
+    id_ = row["raw_extrinsic_metadata.id"]
+    if type_ != MetadataTargetType.ORIGIN:
+        id_ = parse_swhid(id_)
+    return RawExtrinsicMetadata(
+        type=type_,
+        id=id_,
+        authority=MetadataAuthority(
+            type=MetadataAuthorityType(row["metadata_authority.type"]),
+            url=row["metadata_authority.url"],
+        ),
+        fetcher=MetadataFetcher(
+            name=row["metadata_fetcher.name"], version=row["metadata_fetcher.version"],
+        ),
+        discovery_date=row["discovery_date"],
+        format=row["format"],
+        metadata=row["raw_extrinsic_metadata.metadata"],
+        origin=row["origin"],
+        visit=row["visit"],
+        snapshot=map_optional(parse_swhid, row["snapshot"]),
+        release=map_optional(parse_swhid, row["release"]),
+        revision=map_optional(parse_swhid, row["revision"]),
+        path=row["path"],
+        directory=map_optional(parse_swhid, row["directory"]),
+    )
+
+
 def origin_url_to_sha1(origin_url):
     """Convert an origin URL to a sha1. Encodes URL to utf-8."""
     return MultiHash.from_data(origin_url.encode("utf-8"), {"sha1"}).digest()["sha1"]
diff --git a/swh/storage/replay.py b/swh/storage/replay.py
--- a/swh/storage/replay.py
+++ b/swh/storage/replay.py
@@ -19,13 +19,16 @@
     BaseModel,
     Content,
     Directory,
+    MetadataAuthority,
+    MetadataFetcher,
     Origin,
     OriginVisit,
     OriginVisitStatus,
+    RawExtrinsicMetadata,
+    Release,
     Revision,
     SkippedContent,
     Snapshot,
-    Release,
 )

 from swh.storage.exc import HashCollision
@@ -45,6 +48,9 @@
     "directory": Directory.from_dict,
     "content": Content.from_dict,
     "skipped_content": SkippedContent.from_dict,
+    "metadata_authority": MetadataAuthority.from_dict,
+    "metadata_fetcher": MetadataFetcher.from_dict,
+    "raw_extrinsic_metadata": RawExtrinsicMetadata.from_dict,
 }
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -25,7 +25,7 @@
 import psycopg2.errors

 from swh.core.api.serializers import msgpack_loads, msgpack_dumps
-from swh.model.identifiers import parse_swhid, SWHID
+from swh.model.identifiers import SWHID
 from swh.model.model import (
     Content,
     Directory,
@@ -1277,35 +1277,8 @@
         rows = [dict(zip(db.raw_extrinsic_metadata_get_cols, row)) for row in rows]
         results = []
         for row in rows:
-            row = row.copy()
-            row.pop("metadata_fetcher.id")
-            assert str(id) == row["raw_extrinsic_metadata.id"]
-
-            result = RawExtrinsicMetadata(
-                type=MetadataTargetType(row["raw_extrinsic_metadata.type"]),
-                id=id,
-                authority=MetadataAuthority(
-                    type=MetadataAuthorityType(row["metadata_authority.type"]),
-                    url=row["metadata_authority.url"],
-                ),
-                fetcher=MetadataFetcher(
-                    name=row["metadata_fetcher.name"],
-                    version=row["metadata_fetcher.version"],
-                ),
-                discovery_date=row["discovery_date"],
-                format=row["format"],
-                metadata=row["raw_extrinsic_metadata.metadata"],
-                origin=row["origin"],
-                visit=row["visit"],
-                snapshot=map_optional(parse_swhid, row["snapshot"]),
-                release=map_optional(parse_swhid, row["release"]),
-                revision=map_optional(parse_swhid, row["revision"]),
-                path=row["path"],
-                directory=map_optional(parse_swhid, row["directory"]),
-            )
-
-            results.append(result)
+            results.append(converters.db_to_raw_extrinsic_metadata(row))

         if len(results) > limit:
             results.pop()
diff --git a/swh/storage/tests/test_backfill.py b/swh/storage/tests/test_backfill.py
--- a/swh/storage/tests/test_backfill.py
+++ b/swh/storage/tests/test_backfill.py
@@ -34,7 +34,7 @@
     with pytest.raises(ValueError) as e:
         JournalBackfiller(config)

-    error = "Configuration error: The following keys must be" " provided: %s" % (
+    error = "Configuration error: The following keys must be provided: %s" % (
         ",".join([key]),
     )
     assert e.value.args[0] == error
@@ -167,12 +167,15 @@
     "content": lambda start, end: [(None, None)],
     "skipped_content": lambda start, end: [(None, None)],
     "directory": lambda start, end: [(None, None)],
+    "metadata_authority": lambda start, end: [(None, None)],
+    "metadata_fetcher": lambda start, end: [(None, None)],
     "revision": lambda start, end: [(None, None)],
     "release": lambda start, end: [(None, None)],
     "snapshot": lambda start, end: [(None, None)],
     "origin": lambda start, end: [(None, 10000)],
     "origin_visit": lambda start, end: [(None, 10000)],
     "origin_visit_status": lambda start, end: [(None, 10000)],
+    "raw_extrinsic_metadata": lambda start, end: [(None, None)],
 }
diff --git a/swh/storage/tests/test_kafka_writer.py b/swh/storage/tests/test_kafka_writer.py
--- a/swh/storage/tests/test_kafka_writer.py
+++ b/swh/storage/tests/test_kafka_writer.py
@@ -43,11 +43,14 @@
         "content",
         "skipped_content",
         "directory",
+        "metadata_authority",
+        "metadata_fetcher",
         "revision",
         "release",
         "snapshot",
         "origin",
         "origin_visit_status",
+        "raw_extrinsic_metadata",
     ):
         method(objs)
         expected_messages += len(objs)
@@ -70,9 +73,12 @@
     for obj_type in (
         "content",
         "directory",
+        "metadata_authority",
+        "metadata_fetcher",
         "origin",
         "origin_visit",
         "origin_visit_status",
+        "raw_extrinsic_metadata",
         "release",
         "revision",
         "snapshot",
@@ -123,9 +129,12 @@
     for obj_type in (
         "content",
         "directory",
+        "metadata_authority",
+        "metadata_fetcher",
         "origin",
         "origin_visit",
         "origin_visit_status",
+        "raw_extrinsic_metadata",
         "release",
         "revision",
         "snapshot",
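
Illustration (not part of the patch): a minimal sketch of how the new db_to_raw_extrinsic_metadata converter is expected to be called by the backfiller. The dotted keys mirror the raw_extrinsic_metadata column list added to backfill.py above; the concrete values (URLs, fetcher name/version, dates, payload) are invented for the example, and an origin-targeted row keeps the origin URL as its id, so parse_swhid is only used for the other target types and the SWHID context columns.

import datetime

from swh.storage.converters import db_to_raw_extrinsic_metadata

# A made-up flat row, as the backfiller would read it from PostgreSQL after
# joining metadata_authority and metadata_fetcher.
row = {
    "raw_extrinsic_metadata.type": "origin",
    "raw_extrinsic_metadata.id": "https://example.org/repo.git",
    "metadata_authority.type": "forge",
    "metadata_authority.url": "https://example.org",
    "metadata_fetcher.name": "swh.loader.git",
    "metadata_fetcher.version": "1.0.0",
    "discovery_date": datetime.datetime.now(tz=datetime.timezone.utc),
    "format": "json",
    "raw_extrinsic_metadata.metadata": b'{"license": "GPL-3.0"}',
    # Context columns are all None for an origin-targeted row.
    "origin": None,
    "visit": None,
    "snapshot": None,
    "release": None,
    "revision": None,
    "path": None,
    "directory": None,
}

# Returns a RawExtrinsicMetadata model object; the backfiller converter then
# calls .to_dict() on it before writing to the journal.
metadata = db_to_raw_extrinsic_metadata(row)
print(metadata.to_dict())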