diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1 +1 @@ -swh.model >= 0.10.0 +swh.model >= 0.12.0 diff --git a/swh/journal/pytest_plugin.py b/swh/journal/pytest_plugin.py --- a/swh/journal/pytest_plugin.py +++ b/swh/journal/pytest_plugin.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,7 +14,7 @@ import pytest from swh.journal.serializers import kafka_to_key, kafka_to_value, pprint_key -from swh.journal.tests.journal_data import TEST_OBJECTS +from swh.model.tests.swh_model_data import TEST_OBJECTS def ensure_lists(value: Any) -> Any: diff --git a/swh/journal/serializers.py b/swh/journal/serializers.py --- a/swh/journal/serializers.py +++ b/swh/journal/serializers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2017 The Software Heritage developers +# Copyright (C) 2016-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -71,6 +71,8 @@ return "{%s}" % ", ".join( f"{k}: {stringify_key_item(k, v)}" for k, v in key.items() ) + elif isinstance(key, tuple): + return "{%s}" % ", ".join(f"{k}: {stringify_key_item(k, v)}" for k, v in key) elif isinstance(key, bytes): return key.hex() else: diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py --- a/swh/journal/tests/conftest.py +++ b/swh/journal/tests/conftest.py @@ -1,17 +1,13 @@ -# Copyright 
(C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging from hypothesis.strategies import one_of -# for bw compat -from swh.journal.tests.journal_data import * # noqa from swh.model import hypothesis_strategies as strategies - -logger = logging.getLogger(__name__) +from swh.model.tests.swh_model_data import TEST_OBJECTS # noqa def objects_d(): diff --git a/swh/journal/tests/journal_data.py b/swh/journal/tests/journal_data.py deleted file mode 100644 --- a/swh/journal/tests/journal_data.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import datetime -from typing import Dict, Sequence - -import attr - -from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex -from swh.model.identifiers import SWHID -from swh.model.model import ( - BaseModel, - Content, - Directory, - DirectoryEntry, - MetadataAuthority, - MetadataAuthorityType, - MetadataFetcher, - MetadataTargetType, - ObjectType, - Origin, - OriginVisit, - OriginVisitStatus, - Person, - RawExtrinsicMetadata, - Release, - Revision, - RevisionType, - SkippedContent, - Snapshot, - SnapshotBranch, - TargetType, - Timestamp, - TimestampWithTimezone, -) - -UTC = datetime.timezone.utc - -CONTENTS = [ - Content( - length=4, - data=f"foo{i}".encode(), - status="visible", - **MultiHash.from_data(f"foo{i}".encode()).digest(), - ) - for i in range(10) -] + [ - Content( - length=14, - data=f"forbidden foo{i}".encode(), - status="hidden", - **MultiHash.from_data(f"forbidden foo{i}".encode()).digest(), - ) - for i in 
range(10) -] - -SKIPPED_CONTENTS = [ - SkippedContent( - length=4, - status="absent", - reason=f"because chr({i}) != '*'", - **MultiHash.from_data(f"bar{i}".encode()).digest(), - ) - for i in range(2) -] - -duplicate_content1 = Content( - length=4, - sha1=hash_to_bytes("44973274ccef6ab4dfaaf86599792fa9c3fe4689"), - sha1_git=b"another-foo", - blake2s256=b"another-bar", - sha256=b"another-baz", - status="visible", -) - -# Craft a sha1 collision -sha1_array = bytearray(duplicate_content1.sha1_git) -sha1_array[0] += 1 -duplicate_content2 = attr.evolve(duplicate_content1, sha1_git=bytes(sha1_array)) - - -DUPLICATE_CONTENTS = [duplicate_content1, duplicate_content2] - - -COMMITTERS = [ - Person(fullname=b"foo", name=b"foo", email=b""), - Person(fullname=b"bar", name=b"bar", email=b""), -] - -DATES = [ - TimestampWithTimezone( - timestamp=Timestamp(seconds=1234567891, microseconds=0,), - offset=120, - negative_utc=False, - ), - TimestampWithTimezone( - timestamp=Timestamp(seconds=1234567892, microseconds=0,), - offset=120, - negative_utc=False, - ), -] - -REVISIONS = [ - Revision( - id=hash_to_bytes("4ca486e65eb68e4986aeef8227d2db1d56ce51b3"), - message=b"hello", - date=DATES[0], - committer=COMMITTERS[0], - author=COMMITTERS[0], - committer_date=DATES[0], - type=RevisionType.GIT, - directory=b"\x01" * 20, - synthetic=False, - metadata=None, - parents=(), - ), - Revision( - id=hash_to_bytes("677063f5c405d6fc1781fc56379c9a9adf43d3a0"), - message=b"hello again", - date=DATES[1], - committer=COMMITTERS[1], - author=COMMITTERS[1], - committer_date=DATES[1], - type=RevisionType.MERCURIAL, - directory=b"\x02" * 20, - synthetic=False, - metadata=None, - parents=(), - extra_headers=((b"foo", b"bar"),), - ), -] - -RELEASES = [ - Release( - id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"), - name=b"v0.0.1", - date=TimestampWithTimezone( - timestamp=Timestamp(seconds=1234567890, microseconds=0,), - offset=120, - negative_utc=False, - ), - author=COMMITTERS[0], - 
target_type=ObjectType.REVISION, - target=b"\x04" * 20, - message=b"foo", - synthetic=False, - ), -] - -ORIGINS = [ - Origin(url="https://somewhere.org/den/fox",), - Origin(url="https://overtherainbow.org/fox/den",), -] - -ORIGIN_VISITS = [ - OriginVisit( - origin=ORIGINS[0].url, - date=datetime.datetime(2013, 5, 7, 4, 20, 39, 369271, tzinfo=UTC), - visit=1, - type="git", - ), - OriginVisit( - origin=ORIGINS[1].url, - date=datetime.datetime(2014, 11, 27, 17, 20, 39, tzinfo=UTC), - visit=1, - type="hg", - ), - OriginVisit( - origin=ORIGINS[0].url, - date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC), - visit=2, - type="git", - ), - OriginVisit( - origin=ORIGINS[0].url, - date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC), - visit=3, - type="git", - ), - OriginVisit( - origin=ORIGINS[1].url, - date=datetime.datetime(2015, 11, 27, 17, 20, 39, tzinfo=UTC), - visit=2, - type="hg", - ), -] - -# The origin-visit-status dates needs to be shifted slightly in the future from their -# visit dates counterpart. Otherwise, we are hitting storage-wise the "on conflict" -# ignore policy (because origin-visit-add creates an origin-visit-status with the same -# parameters from the origin-visit {origin, visit, date}... 
-ORIGIN_VISIT_STATUSES = [ - OriginVisitStatus( - origin=ORIGINS[0].url, - date=datetime.datetime(2013, 5, 7, 4, 20, 39, 432222, tzinfo=UTC), - visit=1, - type="git", - status="ongoing", - snapshot=None, - metadata=None, - ), - OriginVisitStatus( - origin=ORIGINS[1].url, - date=datetime.datetime(2014, 11, 27, 17, 21, 12, tzinfo=UTC), - visit=1, - type="hg", - status="ongoing", - snapshot=None, - metadata=None, - ), - OriginVisitStatus( - origin=ORIGINS[0].url, - date=datetime.datetime(2018, 11, 27, 17, 20, 59, tzinfo=UTC), - visit=2, - type="git", - status="ongoing", - snapshot=None, - metadata=None, - ), - OriginVisitStatus( - origin=ORIGINS[0].url, - date=datetime.datetime(2018, 11, 27, 17, 20, 49, tzinfo=UTC), - visit=3, - type="git", - status="full", - snapshot=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), - metadata=None, - ), - OriginVisitStatus( - origin=ORIGINS[1].url, - date=datetime.datetime(2015, 11, 27, 17, 22, 18, tzinfo=UTC), - visit=2, - type="hg", - status="partial", - snapshot=hash_to_bytes("8ce268b87faf03850693673c3eb5c9bb66e1ca38"), - metadata=None, - ), -] - - -DIRECTORIES = [ - Directory(id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=()), - Directory( - id=hash_to_bytes("21416d920e0ebf0df4a7888bed432873ed5cb3a7"), - entries=( - DirectoryEntry( - name=b"file1.ext", - perms=0o644, - type="file", - target=CONTENTS[0].sha1_git, - ), - DirectoryEntry( - name=b"dir1", - perms=0o755, - type="dir", - target=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), - ), - DirectoryEntry( - name=b"subprepo1", perms=0o160000, type="rev", target=REVISIONS[1].id, - ), - ), - ), -] - - -SNAPSHOTS = [ - Snapshot( - id=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), - branches={ - b"master": SnapshotBranch( - target_type=TargetType.REVISION, target=REVISIONS[0].id - ) - }, - ), - Snapshot( - id=hash_to_bytes("8ce268b87faf03850693673c3eb5c9bb66e1ca38"), - branches={ - b"target/revision": SnapshotBranch( - 
target_type=TargetType.REVISION, target=REVISIONS[0].id, - ), - b"target/alias": SnapshotBranch( - target_type=TargetType.ALIAS, target=b"target/revision" - ), - b"target/directory": SnapshotBranch( - target_type=TargetType.DIRECTORY, target=DIRECTORIES[0].id, - ), - b"target/release": SnapshotBranch( - target_type=TargetType.RELEASE, target=RELEASES[0].id - ), - b"target/snapshot": SnapshotBranch( - target_type=TargetType.SNAPSHOT, - target=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), - ), - }, - ), -] - - -METADATA_AUTHORITIES = [ - MetadataAuthority( - type=MetadataAuthorityType.FORGE, url="http://example.org/", metadata={}, - ), -] - -METADATA_FETCHERS = [ - MetadataFetcher(name="test-fetcher", version="1.0.0", metadata={},) -] - -RAW_EXTRINSIC_METADATA = [ - RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, - target="http://example.org/foo.git", - discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC), - authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None), - fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None), - format="json", - metadata=b'{"foo": "bar"}', - ), - RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, - target=SWHID( - object_type="content", object_id=hash_to_hex(CONTENTS[0].sha1_git) - ), - discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC), - authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None), - fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None), - format="json", - metadata=b'{"foo": "bar"}', - ), -] - - -TEST_OBJECTS: Dict[str, Sequence[BaseModel]] = { - "content": CONTENTS, - "directory": DIRECTORIES, - "metadata_authority": METADATA_AUTHORITIES, - "metadata_fetcher": METADATA_FETCHERS, - "origin": ORIGINS, - "origin_visit": ORIGIN_VISITS, - "origin_visit_status": ORIGIN_VISIT_STATUSES, - "raw_extrinsic_metadata": RAW_EXTRINSIC_METADATA, - "release": RELEASES, - "revision": REVISIONS, - "snapshot": SNAPSHOTS, - "skipped_content": SKIPPED_CONTENTS, -} diff 
--git a/swh/journal/tests/test_journal_data.py b/swh/journal/tests/test_journal_data.py deleted file mode 100644 --- a/swh/journal/tests/test_journal_data.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from swh.journal.tests.journal_data import TEST_OBJECTS - - -def test_ensure_visit_visit_status_date_consistency(): - """ensure origin-visit-status dates are more recent than their visit counterpart - - The origin-visit-status dates needs to be shifted slightly in the future from their - visit dates counterpart. Otherwise, we are hitting storage-wise the "on conflict" - ignore policy (because origin-visit-add creates an origin-visit-status with the same - parameters from the origin-visit {origin, visit, date}... - - """ - visits = TEST_OBJECTS["origin_visit"] - visit_statuses = TEST_OBJECTS["origin_visit_status"] - for visit, visit_status in zip(visits, visit_statuses): - assert visit.origin == visit_status.origin - assert visit.visit == visit_status.visit - assert visit.date < visit_status.date diff --git a/swh/journal/tests/test_kafka_writer.py b/swh/journal/tests/test_kafka_writer.py --- a/swh/journal/tests/test_kafka_writer.py +++ b/swh/journal/tests/test_kafka_writer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,10 +9,10 @@ import pytest from swh.journal.pytest_plugin import assert_all_objects_consumed, consume_messages -from swh.journal.tests.journal_data import TEST_OBJECTS from swh.journal.writer import model_object_dict_sanitizer from 
swh.journal.writer.kafka import KafkaDeliveryError, KafkaJournalWriter from swh.model.model import BaseModel, Directory, Release, Revision +from swh.model.tests.swh_model_data import TEST_OBJECTS def test_kafka_writer(