diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,9 +1,9 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime, timedelta, timezone +from datetime import datetime from os import path from typing import Any, Dict, Generator, List @@ -14,7 +14,7 @@ from pytest_postgresql.factories import postgresql from swh.journal.serializers import msgpack_ext_hook -from swh.model.model import BaseModel +from swh.model.model import BaseModel, TimestampWithTimezone from swh.provenance import get_provenance, get_provenance_storage from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface @@ -142,7 +142,11 @@ strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) - for objtype, objd in unpacker: + for msg in unpacker: + if len(msg) == 2: # old format + objtype, objd = msg + else: # now we should have a triplet (type, key, value) + objtype, _, objd = msg data.setdefault(objtype, []).append(objd) return data @@ -154,10 +158,5 @@ return obj -# TODO: remove this function in favour of TimestampWithTimezone.to_datetime -# from swh.model.model def ts2dt(ts: Dict[str, Any]) -> datetime: - timestamp = datetime.fromtimestamp( - ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"])) - ) - return timestamp.replace(microsecond=ts["timestamp"]["microseconds"]) + return TimestampWithTimezone.from_dict(ts).to_datetime() diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md --- a/swh/provenance/tests/data/README.md +++ 
b/swh/provenance/tests/data/README.md @@ -3,6 +3,14 @@ This directory contains datasets used by `test_provenance_heurstics` tests of the provenance index database. +## Datasets + +There are currently 3 datasets: + +- cmdbts2: original dataset +- out-of-order: with unsorted revisions +- with-merges: with merge revisions + Each dataset `xxx` consist in several parts: - a description of a git repository as a yaml file named `xxx_repo.yaml`, @@ -12,7 +20,17 @@ describing the expected result in the provenance database if ingested with the flag `lower` set or not set, and the `mindepth` value (integer, most often `1` or `2`). +### Generate dataset files +For each dataset `xxx`, execute a number of commands: + +``` +for dataset in cmdbts2 out-of-order with-merges; do + python generate_repo.py -C ${dataset}_repo.yaml $dataset > synthetic_${dataset}_template.txt + # you may want to edit/update synthetic files from this template, see below + python generate_storage_from_git.py $dataset +done +``` ## Git repos description file diff --git a/swh/provenance/tests/data/cmdbts2.msgpack b/swh/provenance/tests/data/cmdbts2.msgpack index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ Type[BaseModel]: - if object_type == "origin": - return Origin - elif object_type == "origin_visit": - return OriginVisit - elif object_type == "origin_visit_status": - return OriginVisitStatus - elif object_type == "content": - return Content - elif object_type == "directory": - return Directory - elif object_type == "revision": - return Revision - elif object_type == "snapshot": - return Snapshot - raise ValueError + return SWH_MODEL_OBJECT_TYPES[object_type] def data_to_model(data: Dict[str, List[dict]]) -> Dict[str, List[BaseModel]]: @@ -125,7 +113,7 @@ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ], - src_obj: Union[Origin, Snapshot, Revision, Directory, Content], + src_obj: Union[Content, Directory, 
Origin, Release, Revision, Snapshot], dst_id: bytes, dst_type: ExtendedObjectType, ) -> None: @@ -182,25 +170,36 @@ for parent in revision.parents: add_link(edges, revision, parent, ExtendedObjectType.REVISION) + dir_entry_types = { + "file": ExtendedObjectType.CONTENT, + "dir": ExtendedObjectType.DIRECTORY, + "rev": ExtendedObjectType.REVISION, + } for directory in model["directory"]: assert isinstance(directory, Directory) nodes.add(directory.swhid()) for entry in directory.entries: assert isinstance(entry, DirectoryEntry) - if entry.type == "file": - target_type = ExtendedObjectType.CONTENT - elif entry.type == "dir": - target_type = ExtendedObjectType.DIRECTORY - elif entry.type == "rev": - target_type = ExtendedObjectType.REVISION - else: - assert False, "unknown directory entry type" - add_link(edges, directory, entry.target, target_type) + add_link(edges, directory, entry.target, dir_entry_types[entry.type]) for content in model["content"]: assert isinstance(content, Content) nodes.add(content.swhid()) + object_type = { + ObjectType.CONTENT: ExtendedObjectType.CONTENT, + ObjectType.DIRECTORY: ExtendedObjectType.DIRECTORY, + ObjectType.REVISION: ExtendedObjectType.REVISION, + ObjectType.RELEASE: ExtendedObjectType.RELEASE, + ObjectType.SNAPSHOT: ExtendedObjectType.SNAPSHOT, + } + for release in model["release"]: + assert isinstance(release, Release) + nodes.add(release.swhid()) + + if release.target is not None: + add_link(edges, release, release.target, object_type[release.target_type]) + return list(nodes), list(edges) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -131,8 +131,8 @@ data = load_repo_data(repo) fill_storage(swh_storage, data) - assert len(data["origin"]) == 1 - assert {"url": origin_url} in data["origin"] + assert len(data["origin"]) >= 1 + assert origin_url in [o["url"] for o in data["origin"]] cfg = { "provenance": { diff --git 
a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py --- a/swh/provenance/tests/test_journal_client.py +++ b/swh/provenance/tests/test_journal_client.py @@ -41,7 +41,7 @@ # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 + assert len(data["origin"]) >= 1 origin_url = data["origin"][0]["url"] fill_storage(swh_storage, data) @@ -95,7 +95,7 @@ # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 + assert len(data["origin"]) >= 1 fill_storage(swh_storage, data) # Prepare configuration for cli call diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py --- a/swh/provenance/tests/test_origin_iterator.py +++ b/swh/provenance/tests/test_origin_iterator.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,4 +39,8 @@ origins = list(CSVOriginIterator(origins_csv)) assert origins - assert len(origins) == len(data["origin"]) + # there can be more origins, depending on the additional extra visits.yaml + # file used during dataset generation (see data/generate_storage_from_git) + assert len(origins) >= len(data["origin"]) + # but we can check it's a subset + assert set(o.url for o in origins) <= set(o["url"] for o in data["origin"])