diff --git a/swh/storage/postgresql/converters.py b/swh/storage/postgresql/converters.py
index f14307c7..e2649fcf 100644
--- a/swh/storage/postgresql/converters.py
+++ b/swh/storage/postgresql/converters.py
@@ -1,338 +1,343 @@
 # Copyright (C) 2015-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import math
 from typing import Any, Dict, Optional
 import warnings

 from swh.core.utils import encode_with_unescape
 from swh.model.model import (
     ExtID,
     MetadataAuthority,
     MetadataAuthorityType,
     MetadataFetcher,
     ObjectType,
     Origin,
     Person,
     RawExtrinsicMetadata,
     Release,
     Revision,
     RevisionType,
     Timestamp,
     TimestampWithTimezone,
 )
 from swh.model.swhids import CoreSWHID, ExtendedSWHID
 from swh.model.swhids import ObjectType as SwhidObjectType

 from ..utils import map_optional

 DEFAULT_AUTHOR = {
     "fullname": None,
     "name": None,
     "email": None,
 }

 DEFAULT_DATE = {
     "timestamp": None,
     "offset": 0,
     "neg_utc_offset": None,
     "offset_bytes": None,
 }


 def author_to_db(author: Optional[Person]) -> Dict[str, Any]:
     """Convert a swh-model author to its DB representation.

     Args:
         author: a :mod:`swh.model` compatible author

     Returns:
         dict: a dictionary with three keys: fullname, name and email
     """
     if author is None:
         return DEFAULT_AUTHOR
     return author.to_dict()


 def db_to_author(
     fullname: Optional[bytes], name: Optional[bytes], email: Optional[bytes]
 ) -> Optional[Person]:
     """Convert the DB representation of an author to a swh-model author.

     Args:
         fullname (bytes): the author's fullname
         name (bytes): the author's name
         email (bytes): the author's email

     Returns:
         a Person object, or None if 'fullname' is None.
     """
     if fullname is None:
         return None
+
+    if name is None and email is None:
+        # The fullname was never parsed into name/email; parse it now
+        return Person.from_fullname(fullname)
+
     return Person(fullname=fullname, name=name, email=email,)


 def db_to_git_headers(db_git_headers):
     ret = []
     for key, value in db_git_headers:
         ret.append([key.encode("utf-8"), encode_with_unescape(value)])
     return ret


 def db_to_date(
     date: Optional[datetime.datetime], offset_bytes: bytes,
 ) -> Optional[TimestampWithTimezone]:
     """Convert the DB representation of a date to a swh-model compatible date.

     Args:
         date: a date pulled out of the database
         offset_bytes: a byte representation of the UTC offset, usually as
             "+HHMM" or "-HHMM"

     Returns:
         a TimestampWithTimezone, or None if the date is None.
     """
     if date is None:
         return None

     return TimestampWithTimezone(
         timestamp=Timestamp(
             # we use floor() instead of int() to round down, because of negative dates
             seconds=math.floor(date.timestamp()),
             microseconds=date.microsecond,
         ),
         offset_bytes=offset_bytes,
     )


 def date_to_db(ts_with_tz: Optional[TimestampWithTimezone]) -> Dict[str, Any]:
     """Convert a swh-model date_offset to its DB representation.

     Args:
         ts_with_tz: a TimestampWithTimezone object

     Returns:
         dict: a dictionary with these keys:

             - timestamp: a date in ISO format
             - offset_bytes: a byte representation of the UTC offset, usually as
               "+HHMM" or "-HHMM"
     """
     if ts_with_tz is None:
         return DEFAULT_DATE

     ts = ts_with_tz.timestamp

     timestamp = datetime.datetime.fromtimestamp(ts.seconds, datetime.timezone.utc)
     timestamp = timestamp.replace(microsecond=ts.microseconds)

     return {
         # PostgreSQL supports isoformatted timestamps
         "timestamp": timestamp.isoformat(),
         "offset": ts_with_tz.offset_minutes(),
         "neg_utc_offset": ts_with_tz.offset_minutes() == 0
         and ts_with_tz.offset_bytes.startswith(b"-"),
         "offset_bytes": ts_with_tz.offset_bytes,
     }


 def revision_to_db(revision: Revision) -> Dict[str, Any]:
     """Convert a swh-model revision to its database representation.
     """
     author = author_to_db(revision.author)
     date = date_to_db(revision.date)
     committer = author_to_db(revision.committer)
     committer_date = date_to_db(revision.committer_date)

     return {
         "id": revision.id,
         "author_fullname": author["fullname"],
         "author_name": author["name"],
         "author_email": author["email"],
         "date": date["timestamp"],
         "date_offset": date["offset"],
         "date_neg_utc_offset": date["neg_utc_offset"],
         "date_offset_bytes": date["offset_bytes"],
         "committer_fullname": committer["fullname"],
         "committer_name": committer["name"],
         "committer_email": committer["email"],
         "committer_date": committer_date["timestamp"],
         "committer_date_offset": committer_date["offset"],
         "committer_date_neg_utc_offset": committer_date["neg_utc_offset"],
         "committer_date_offset_bytes": committer_date["offset_bytes"],
         "type": revision.type.value,
         "directory": revision.directory,
         "message": revision.message,
         "metadata": None if revision.metadata is None else dict(revision.metadata),
         "synthetic": revision.synthetic,
         "extra_headers": revision.extra_headers,
         "raw_manifest": revision.raw_manifest,
         "parents": [
             {"id": revision.id, "parent_id": parent, "parent_rank": i,}
             for i, parent in enumerate(revision.parents)
         ],
     }


 def db_to_revision(db_revision: Dict[str, Any]) -> Optional[Revision]:
     """Convert a database representation of a revision to its swh-model
     representation."""
     if db_revision["type"] is None:
         assert all(
             v is None for (k, v) in db_revision.items() if k not in ("id", "parents")
         )
         return None

     author = db_to_author(
         db_revision["author_fullname"],
         db_revision["author_name"],
         db_revision["author_email"],
     )
     date = db_to_date(db_revision["date"], db_revision["date_offset_bytes"],)

     committer = db_to_author(
         db_revision["committer_fullname"],
         db_revision["committer_name"],
         db_revision["committer_email"],
     )
     committer_date = db_to_date(
         db_revision["committer_date"], db_revision["committer_date_offset_bytes"],
     )

     assert author, "author is None"
     assert committer, "committer is None"

     parents = []
     if "parents" in db_revision:
         for parent in db_revision["parents"]:
             if parent:
                 parents.append(parent)

     metadata = db_revision["metadata"]
     extra_headers = db_revision["extra_headers"]
     if not extra_headers:
         if metadata and "extra_headers" in metadata:
             extra_headers = db_to_git_headers(metadata.pop("extra_headers"))
         else:
             # For older versions of the database that were not migrated to schema v161
             extra_headers = ()

     return Revision(
         id=db_revision["id"],
         author=author,
         date=date,
         committer=committer,
         committer_date=committer_date,
         type=RevisionType(db_revision["type"]),
         directory=db_revision["directory"],
         message=db_revision["message"],
         metadata=metadata,
         synthetic=db_revision["synthetic"],
         extra_headers=extra_headers,
         parents=tuple(parents),
         raw_manifest=db_revision["raw_manifest"],
     )


 def release_to_db(release: Release) -> Dict[str, Any]:
     """Convert a swh-model release to its database representation.
     """
     author = author_to_db(release.author)
     date = date_to_db(release.date)

     return {
         "id": release.id,
         "author_fullname": author["fullname"],
         "author_name": author["name"],
         "author_email": author["email"],
         "date": date["timestamp"],
         "date_offset": date["offset"],
         "date_neg_utc_offset": date["neg_utc_offset"],
         "date_offset_bytes": date["offset_bytes"],
         "name": release.name,
         "target": release.target,
         "target_type": release.target_type.value,
         "comment": release.message,
         "synthetic": release.synthetic,
         "raw_manifest": release.raw_manifest,
     }


 def db_to_release(db_release: Dict[str, Any]) -> Optional[Release]:
     """Convert a database representation of a release to its swh-model
     representation.
     """
     if db_release["target_type"] is None:
         assert all(v is None for (k, v) in db_release.items() if k != "id")
         return None

     author = db_to_author(
         db_release["author_fullname"],
         db_release["author_name"],
         db_release["author_email"],
     )
     date = db_to_date(db_release["date"], db_release["date_offset_bytes"],)

     return Release(
         author=author,
         date=date,
         id=db_release["id"],
         name=db_release["name"],
         message=db_release["comment"],
         synthetic=db_release["synthetic"],
         target=db_release["target"],
         target_type=ObjectType(db_release["target_type"]),
         raw_manifest=db_release["raw_manifest"],
     )


 def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
     target = row["raw_extrinsic_metadata.target"]
     if not target.startswith("swh:1:"):
         warnings.warn(
             "Fetching raw_extrinsic_metadata row with URL target", DeprecationWarning
         )
         target = str(Origin(url=target).swhid())

     return RawExtrinsicMetadata(
         target=ExtendedSWHID.from_string(target),
         authority=MetadataAuthority(
             type=MetadataAuthorityType(row["metadata_authority.type"]),
             url=row["metadata_authority.url"],
         ),
         fetcher=MetadataFetcher(
             name=row["metadata_fetcher.name"],
             version=row["metadata_fetcher.version"],
         ),
         discovery_date=row["discovery_date"],
         format=row["format"],
         metadata=row["raw_extrinsic_metadata.metadata"],
         origin=row["origin"],
         visit=row["visit"],
         snapshot=map_optional(CoreSWHID.from_string, row["snapshot"]),
         release=map_optional(CoreSWHID.from_string, row["release"]),
         revision=map_optional(CoreSWHID.from_string, row["revision"]),
         path=row["path"],
         directory=map_optional(CoreSWHID.from_string, row["directory"]),
     )


 def db_to_extid(row) -> ExtID:
     return ExtID(
         extid=row["extid"],
         extid_type=row["extid_type"],
         extid_version=row.get("extid_version", 0),
         target=CoreSWHID(
             object_id=row["target"],
             object_type=SwhidObjectType[row["target_type"].upper()],
         ),
     )
diff --git a/swh/storage/tests/test_postgresql_converters.py b/swh/storage/tests/test_postgresql_converters.py
index 0a52874c..75c9567d 100644
--- a/swh/storage/tests/test_postgresql_converters.py
+++ b/swh/storage/tests/test_postgresql_converters.py
@@ -1,307 +1,312 @@
 # Copyright (C) 2015-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime

 import pytest

 from swh.model.model import (
     ObjectType,
     Person,
     Release,
     Revision,
     RevisionType,
     Timestamp,
     TimestampWithTimezone,
 )
 from swh.model.swhids import ExtendedSWHID

 from swh.storage.postgresql import converters


 @pytest.mark.parametrize(
     "model_date,db_date",
     [
         (
             None,
             {
                 "timestamp": None,
                 "offset": 0,
0, "neg_utc_offset": None, "offset_bytes": None, }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset_bytes=b"+0200", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 120, "neg_utc_offset": False, "offset_bytes": b"+0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0,), offset_bytes=b"-0000", ), { "timestamp": "2005-08-07T23:19:49+00:00", "offset": 0, "neg_utc_offset": True, "offset_bytes": b"-0000", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset_bytes=b"+0042", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 42, "neg_utc_offset": False, "offset_bytes": b"+0042", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1634366813, microseconds=0,), offset_bytes=b"-0200", ), { "timestamp": "2021-10-16T06:46:53+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=0, microseconds=0,), offset_bytes=b"-0200", ), { "timestamp": "1970-01-01T00:00:00+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=0, microseconds=1,), offset_bytes=b"-0200", ), { "timestamp": "1970-01-01T00:00:00.000001+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=-1, microseconds=0,), offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:59:59+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=-1, microseconds=1,), offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:59:59.000001+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=-3600, microseconds=0,), offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:00:00+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=-3600, microseconds=1,), offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:00:00.000001+00:00", "offset": -120, "neg_utc_offset": False, "offset_bytes": b"-0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset_bytes=b"+200", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 120, "neg_utc_offset": False, "offset_bytes": b"+200", }, ), ], ) def test_date(model_date, db_date): assert converters.date_to_db(model_date) == db_date assert ( converters.db_to_date( date=None if db_date["timestamp"] is None else datetime.datetime.fromisoformat(db_date["timestamp"]), offset_bytes=db_date["offset_bytes"], ) == model_date ) def test_db_to_author(): # when actual_author = converters.db_to_author(b"fullname", b"name", b"email") # then assert actual_author == Person(fullname=b"fullname", name=b"name", email=b"email",) def test_db_to_author_none(): # when actual_author = converters.db_to_author(None, None, None) # then assert actual_author is None +def test_db_to_author_unparsed(): + author = converters.db_to_author(b"Fullname ", None, None) + assert author == Person.from_fullname(b"Fullname ") + + def test_db_to_revision(): # when actual_revision = converters.db_to_revision( { "id": b"revision-id", "date": None, "date_offset": None, "date_neg_utc_offset": None, "date_offset_bytes": None, "committer_date": None, "committer_date_offset": None, "committer_date_neg_utc_offset": 
None, "committer_date_offset_bytes": None, "type": "git", "directory": b"dir-sha1", "message": b"commit message", "author_fullname": b"auth-fullname", "author_name": b"auth-name", "author_email": b"auth-email", "committer_fullname": b"comm-fullname", "committer_name": b"comm-name", "committer_email": b"comm-email", "metadata": {}, "synthetic": False, "extra_headers": (), "raw_manifest": None, "parents": [b"123", b"456"], } ) # then assert actual_revision == Revision( id=b"revision-id", author=Person( fullname=b"auth-fullname", name=b"auth-name", email=b"auth-email", ), date=None, committer=Person( fullname=b"comm-fullname", name=b"comm-name", email=b"comm-email", ), committer_date=None, type=RevisionType.GIT, directory=b"dir-sha1", message=b"commit message", metadata={}, synthetic=False, extra_headers=(), parents=(b"123", b"456"), ) def test_db_to_release(): # when actual_release = converters.db_to_release( { "id": b"release-id", "target": b"revision-id", "target_type": "revision", "date": None, "date_offset": None, "date_neg_utc_offset": None, "date_offset_bytes": None, "name": b"release-name", "comment": b"release comment", "synthetic": True, "author_fullname": b"auth-fullname", "author_name": b"auth-name", "author_email": b"auth-email", "raw_manifest": None, } ) # then assert actual_release == Release( author=Person( fullname=b"auth-fullname", name=b"auth-name", email=b"auth-email", ), date=None, id=b"release-id", name=b"release-name", message=b"release comment", synthetic=True, target=b"revision-id", target_type=ObjectType.REVISION, ) def test_db_to_raw_extrinsic_metadata_raw_target(): row = { "raw_extrinsic_metadata.target": "https://example.com/origin", "metadata_authority.type": "forge", "metadata_authority.url": "https://example.com", "metadata_fetcher.name": "swh.lister", "metadata_fetcher.version": "1.0.0", "discovery_date": datetime.datetime( 2021, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc ), "format": "text/plain", "raw_extrinsic_metadata.metadata": b"metadata", "origin": None, "visit": None, "snapshot": None, "release": None, "revision": None, "path": None, "directory": None, } with pytest.deprecated_call(): computed_rem = converters.db_to_raw_extrinsic_metadata(row) assert computed_rem.target == ExtendedSWHID.from_string( "swh:1:ori:5a7439b0b93a5d230b6a67b8e7e0f7dc3c9f6c70" )