# Reconstructed (post-patch side) from a unified diff covering two files:
#   swh/storage/converters.py             (index 7bdc4421..541d5aa2)
#   swh/storage/tests/test_converters.py  (index 72443ec4..83f63e11)

# ======================================================================
# File: swh/storage/converters.py
# ======================================================================

# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime

from typing import Optional, Dict

from swh.core.utils import decode_with_escape, encode_with_unescape
from swh.model import identifiers
from swh.model.hashutil import MultiHash

DEFAULT_AUTHOR = {
    "fullname": None,
    "name": None,
    "email": None,
}

DEFAULT_DATE = {
    "timestamp": None,
    "offset": 0,
    "neg_utc_offset": None,
}


def author_to_db(author):
    """Convert a swh-model author to its DB representation.

    Args:
        author: a :mod:`swh.model` compatible author, or None

    Returns:
        dict: a dictionary with three keys: fullname, name and email.
        When ``author`` is None, a fresh copy of ``DEFAULT_AUTHOR`` (all
        values None) is returned; otherwise ``author`` is returned as is.

    """
    if author is None:
        # Return a copy so that a caller mutating the result cannot corrupt
        # the shared module-level default.
        return dict(DEFAULT_AUTHOR)

    return author


def db_to_author(
    fullname: Optional[bytes], name: Optional[bytes], email: Optional[bytes]
) -> Optional[Dict[str, Optional[bytes]]]:
    """Convert the DB representation of an author to a swh-model author.

    Args:
        fullname (bytes): the author's fullname
        name (bytes): the author's name
        email (bytes): the author's email

    Returns:
        a dictionary with three keys (fullname, name and email), or
        None if all the arguments are None.

    """
    if (fullname, name, email) == (None, None, None):
        return None

    return {
        "fullname": fullname,
        "name": name,
        "email": email,
    }


def git_headers_to_db(git_headers):
    """Convert git headers to their database representation.

    We convert the bytes to unicode by decoding them into utf-8 and replacing
    invalid utf-8 sequences with backslash escapes.

    Args:
        git_headers: an iterable of ``(key, values)`` pairs, where ``values``
            is either a single value or a list of values

    Returns:
        list: a list of ``[key, values]`` pairs with every value passed
        through :func:`decode_with_escape`; the single-value/list shape of
        each entry is preserved.

    """
    ret = []
    for key, values in git_headers:
        if isinstance(values, list):
            ret.append([key, [decode_with_escape(value) for value in values]])
        else:
            ret.append([key, decode_with_escape(values)])

    return ret


def db_to_git_headers(db_git_headers):
    """Convert the database representation of git headers back to git headers.

    This is the inverse of :func:`git_headers_to_db`: every value is passed
    through :func:`encode_with_unescape`.

    Args:
        db_git_headers: an iterable of ``(key, values)`` pairs, where
            ``values`` is either a single value or a list of values

    Returns:
        list: a list of ``[key, values]`` pairs, with the single-value/list
        shape of each entry preserved.

    """
    ret = []
    for key, values in db_git_headers:
        if isinstance(values, list):
            ret.append([key, [encode_with_unescape(value) for value in values]])
        else:
            ret.append([key, encode_with_unescape(values)])

    return ret


def db_to_date(date, offset, neg_utc_offset):
    """Convert the DB representation of a date to a swh-model compatible date.

    Args:
        date (datetime.datetime): a date pulled out of the database
        offset (int): an integer number of minutes representing an UTC offset
        neg_utc_offset (boolean): whether an utc offset is negative

    Returns:
        dict: a dict with three keys:

            - timestamp: a timestamp from UTC
            - offset: the number of minutes since UTC
            - negative_utc: whether a null UTC offset is negative

        or None if ``date`` is None.

    """

    if date is None:
        return None

    return {
        "timestamp": {
            "seconds": int(date.timestamp()),
            "microseconds": date.microsecond,
        },
        "offset": offset,
        "negative_utc": neg_utc_offset,
    }


def date_to_db(date_offset):
    """Convert a swh-model date_offset to its DB representation.

    Args:
        date_offset: a :mod:`swh.model` compatible date_offset, or None

    Returns:
        dict: a dictionary with three keys:

            - timestamp: a date in ISO format
            - offset: the UTC offset in minutes
            - neg_utc_offset: a boolean indicating whether a null offset is
              negative or positive.

        When ``date_offset`` is None, a fresh copy of ``DEFAULT_DATE`` is
        returned.

    """

    if date_offset is None:
        # Return a copy so that a caller mutating the result cannot corrupt
        # the shared module-level default.
        return dict(DEFAULT_DATE)

    normalized = identifiers.normalize_timestamp(date_offset)

    ts = normalized["timestamp"]
    seconds = ts.get("seconds", 0)
    microseconds = ts.get("microseconds", 0)

    # Build the datetime from whole seconds in UTC, then set the sub-second
    # part explicitly.
    timestamp = datetime.datetime.fromtimestamp(seconds, datetime.timezone.utc)
    timestamp = timestamp.replace(microsecond=microseconds)

    return {
        # PostgreSQL supports isoformatted timestamps
        "timestamp": timestamp.isoformat(),
        "offset": normalized["offset"],
        "neg_utc_offset": normalized["negative_utc"],
    }


def revision_to_db(rev):
    """Convert a swh-model revision to its database representation.

    Args:
        rev: a revision object exposing ``to_dict()``

    Returns:
        dict: a flat dictionary of revision columns, plus a ``parents`` list
        with one ``{"id", "parent_id", "parent_rank"}`` entry per parent.

    """

    revision = rev.to_dict()

    author = author_to_db(revision["author"])
    date = date_to_db(revision["date"])
    committer = author_to_db(revision["committer"])
    committer_date = date_to_db(revision["committer_date"])

    metadata = revision["metadata"]

    if metadata and "extra_headers" in metadata:
        # Work on a copy: the dict obtained from to_dict() is left untouched.
        metadata = metadata.copy()
        extra_headers = git_headers_to_db(metadata["extra_headers"])
        metadata["extra_headers"] = extra_headers

    return {
        "id": revision["id"],
        "author_fullname": author["fullname"],
        "author_name": author["name"],
        "author_email": author["email"],
        "date": date["timestamp"],
        "date_offset": date["offset"],
        "date_neg_utc_offset": date["neg_utc_offset"],
        "committer_fullname": committer["fullname"],
        "committer_name": committer["name"],
        "committer_email": committer["email"],
        "committer_date": committer_date["timestamp"],
        "committer_date_offset": committer_date["offset"],
        "committer_date_neg_utc_offset": committer_date["neg_utc_offset"],
        "type": revision["type"],
        "directory": revision["directory"],
        "message": revision["message"],
        "metadata": metadata,
        "synthetic": revision["synthetic"],
        "parents": [
            {"id": revision["id"], "parent_id": parent, "parent_rank": i}
            for i, parent in enumerate(revision["parents"])
        ],
    }


def db_to_revision(db_revision):
    """Convert a database representation of a revision to its swh-model
    representation.

    Args:
        db_revision: a mapping of revision columns. The ``parents`` and
            ``object_id`` keys are optional.

    Returns:
        dict: the revision as a dictionary. Falsy entries of ``parents`` are
        dropped; ``object_id`` is only present when the input has it. The
        input's ``metadata`` dictionary is not modified.

    """

    author = db_to_author(
        db_revision["author_fullname"],
        db_revision["author_name"],
        db_revision["author_email"],
    )
    date = db_to_date(
        db_revision["date"],
        db_revision["date_offset"],
        db_revision["date_neg_utc_offset"],
    )

    committer = db_to_author(
        db_revision["committer_fullname"],
        db_revision["committer_name"],
        db_revision["committer_email"],
    )
    committer_date = db_to_date(
        db_revision["committer_date"],
        db_revision["committer_date_offset"],
        db_revision["committer_date_neg_utc_offset"],
    )

    metadata = db_revision["metadata"]

    if metadata and "extra_headers" in metadata:
        # Copy before rewriting the headers, mirroring revision_to_db, so the
        # caller's metadata dictionary is not mutated in place.
        metadata = metadata.copy()
        extra_headers = db_to_git_headers(metadata["extra_headers"])
        metadata["extra_headers"] = extra_headers

    parents = []
    if "parents" in db_revision:
        parents = [parent for parent in db_revision["parents"] if parent]

    ret = {
        "id": db_revision["id"],
        "author": author,
        "date": date,
        "committer": committer,
        "committer_date": committer_date,
        "type": db_revision["type"],
        "directory": db_revision["directory"],
        "message": db_revision["message"],
        "metadata": metadata,
        "synthetic": db_revision["synthetic"],
        "parents": parents,
    }

    if "object_id" in db_revision:
        ret["object_id"] = db_revision["object_id"]

    return ret


def release_to_db(rel):
    """Convert a swh-model release to its database representation.

    Args:
        rel: a release object exposing ``to_dict()``

    Returns:
        dict: a flat dictionary of release columns; the release ``message``
        is stored under the ``comment`` key.

    """

    release = rel.to_dict()

    author = author_to_db(release["author"])
    date = date_to_db(release["date"])

    return {
        "id": release["id"],
        "author_fullname": author["fullname"],
        "author_name": author["name"],
        "author_email": author["email"],
        "date": date["timestamp"],
        "date_offset": date["offset"],
        "date_neg_utc_offset": date["neg_utc_offset"],
        "name": release["name"],
        "target": release["target"],
        "target_type": release["target_type"],
        "comment": release["message"],
        "synthetic": release["synthetic"],
    }


def db_to_release(db_release):
    """Convert a database representation of a release to its swh-model
    representation.

    Args:
        db_release: a mapping of release columns. The ``object_id`` key is
            optional.

    Returns:
        dict: the release as a dictionary; the ``comment`` column is exposed
        under the ``message`` key, and ``object_id`` is only present when the
        input has it.

    """

    author = db_to_author(
        db_release["author_fullname"],
        db_release["author_name"],
        db_release["author_email"],
    )
    date = db_to_date(
        db_release["date"], db_release["date_offset"], db_release["date_neg_utc_offset"]
    )

    ret = {
        "author": author,
        "date": date,
        "id": db_release["id"],
        "name": db_release["name"],
        "message": db_release["comment"],
        "synthetic": db_release["synthetic"],
        "target": db_release["target"],
        "target_type": db_release["target_type"],
    }

    if "object_id" in db_release:
        ret["object_id"] = db_release["object_id"]

    return ret


def origin_url_to_sha1(origin_url):
    """Convert an origin URL to a sha1. Encodes URL to utf-8."""
    return MultiHash.from_data(origin_url.encode("utf-8"), {"sha1"}).digest()["sha1"]


# ======================================================================
# File: swh/storage/tests/test_converters.py
# ======================================================================

# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.storage import converters


def test_date_to_db():
    date_to_db = converters.date_to_db
    assert date_to_db(None) == {"timestamp": None, "offset": 0, "neg_utc_offset": None}

    assert date_to_db(
        {"timestamp": 1234567890, "offset": 120, "negative_utc": False}
    ) == {
        "timestamp": "2009-02-13T23:31:30+00:00",
        "offset": 120,
        "neg_utc_offset": False,
    }

    assert date_to_db(
        {"timestamp": 1123456789, "offset": 0, "negative_utc": True}
    ) == {
        "timestamp": "2005-08-07T23:19:49+00:00",
        "offset": 0,
        "neg_utc_offset": True,
    }

    assert date_to_db(
        {"timestamp": 1234567890, "offset": 42, "negative_utc": False}
    ) == {
        "timestamp": "2009-02-13T23:31:30+00:00",
        "offset": 42,
        "neg_utc_offset": False,
    }

    assert date_to_db(
        {"timestamp": 1634366813, "offset": -120, "negative_utc": False}
    ) == {
        "timestamp": "2021-10-16T06:46:53+00:00",
        "offset": -120,
        "neg_utc_offset": False,
    }


def test_date_to_db_default_is_a_copy():
    # when: a caller mutates the default it was handed
    actual_date = converters.date_to_db(None)
    actual_date["offset"] = 120

    # then: later calls still see the pristine default
    assert converters.date_to_db(None) == {
        "timestamp": None,
        "offset": 0,
        "neg_utc_offset": None,
    }


def test_author_to_db_default_is_a_copy():
    # when: a caller mutates the default it was handed
    actual_author = converters.author_to_db(None)
    actual_author["name"] = b"mutated"

    # then: later calls still see the pristine default
    assert converters.author_to_db(None) == {
        "fullname": None,
        "name": None,
        "email": None,
    }


def test_db_to_author():
    # when
    actual_author = converters.db_to_author(b"fullname", b"name", b"email")

    # then
    assert actual_author == {
        "fullname": b"fullname",
        "name": b"name",
        "email": b"email",
    }


def test_db_to_author_none():
    # when
    actual_author = converters.db_to_author(None, None, None)

    # then
    assert actual_author is None


def test_db_to_revision():
    # when
    actual_revision = converters.db_to_revision(
        {
            "id": "revision-id",
            "date": None,
            "date_offset": None,
            "date_neg_utc_offset": None,
            "committer_date": None,
            "committer_date_offset": None,
            "committer_date_neg_utc_offset": None,
            "type": "rev",
            "directory": b"dir-sha1",
            "message": b"commit message",
            "author_fullname": b"auth-fullname",
            "author_name": b"auth-name",
            "author_email": b"auth-email",
            "committer_fullname": b"comm-fullname",
            "committer_name": b"comm-name",
            "committer_email": b"comm-email",
            "metadata": {},
            "synthetic": False,
            "parents": [123, 456],
        }
    )

    # then
    assert actual_revision == {
        "id": "revision-id",
        "author": {
            "fullname": b"auth-fullname",
            "name": b"auth-name",
            "email": b"auth-email",
        },
        "date": None,
        "committer": {
            "fullname": b"comm-fullname",
            "name": b"comm-name",
            "email": b"comm-email",
        },
        "committer_date": None,
        "type": "rev",
        "directory": b"dir-sha1",
        "message": b"commit message",
        "metadata": {},
        "synthetic": False,
        "parents": [123, 456],
    }


def test_db_to_revision_does_not_mutate_metadata():
    # given
    metadata = {"extra_headers": [["gpgsig", "sig"]]}

    # when
    converters.db_to_revision(
        {
            "id": "revision-id",
            "date": None,
            "date_offset": None,
            "date_neg_utc_offset": None,
            "committer_date": None,
            "committer_date_offset": None,
            "committer_date_neg_utc_offset": None,
            "type": "rev",
            "directory": b"dir-sha1",
            "message": b"commit message",
            "author_fullname": b"auth-fullname",
            "author_name": b"auth-name",
            "author_email": b"auth-email",
            "committer_fullname": b"comm-fullname",
            "committer_name": b"comm-name",
            "committer_email": b"comm-email",
            "metadata": metadata,
            "synthetic": False,
            "parents": [],
        }
    )

    # then: the caller's dictionary still holds the DB (str) representation
    assert metadata == {"extra_headers": [["gpgsig", "sig"]]}


def test_db_to_release():
    # when
    actual_release = converters.db_to_release(
        {
            "id": b"release-id",
            "target": b"revision-id",
            "target_type": "revision",
            "date": None,
            "date_offset": None,
            "date_neg_utc_offset": None,
            "name": b"release-name",
            "comment": b"release comment",
            "synthetic": True,
            "author_fullname": b"auth-fullname",
            "author_name": b"auth-name",
            "author_email": b"auth-email",
        }
    )

    # then
    assert actual_release == {
        "author": {
            "fullname": b"auth-fullname",
            "name": b"auth-name",
            "email": b"auth-email",
        },
        "date": None,
        "id": b"release-id",
        "name": b"release-name",
        "message": b"release comment",
        "synthetic": True,
        "target": b"revision-id",
        "target_type": "revision",
    }


def test_db_to_git_headers():
    raw_data = [
        ["gpgsig", b"garbage\x89a\x43b\x14"],
        ["extra", [b"foo\\\\\\o", b"bar\\", b"inval\\\\\x99id"]],
    ]

    db_data = converters.git_headers_to_db(raw_data)
    loop = converters.db_to_git_headers(db_data)
    assert raw_data == loop