Changeset View
Changeset View
Standalone View
Standalone View
swh/journal/tests/test_replay.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import functools | import functools | ||||
import random | import random | ||||
from subprocess import Popen | from subprocess import Popen | ||||
from typing import Tuple | from typing import Tuple | ||||
import dateutil | import dateutil | ||||
from confluent_kafka import Producer | from confluent_kafka import Producer | ||||
from hypothesis import strategies, given, settings | from hypothesis import strategies, given, settings | ||||
import pytest | |||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.in_memory import ENABLE_ORIGIN_IDS | |||||
from swh.journal.client import JournalClient | from swh.journal.client import JournalClient | ||||
from swh.journal.serializers import key_to_kafka, value_to_kafka | from swh.journal.serializers import key_to_kafka, value_to_kafka | ||||
from swh.journal.replay import process_replay_objects, is_hash_in_bytearray | from swh.journal.replay import process_replay_objects, is_hash_in_bytearray | ||||
from .conftest import OBJECT_TYPE_KEYS | from .conftest import OBJECT_TYPE_KEYS | ||||
from .utils import MockedJournalClient, MockedKafkaWriter | from .utils import MockedJournalClient, MockedKafkaWriter | ||||
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines | assert OBJECT_TYPE_KEYS['release'][1] == \ | ||||
list(storage.release_get( | list(storage.release_get( | ||||
[rel['id'] for rel in OBJECT_TYPE_KEYS['release'][1]])) | [rel['id'] for rel in OBJECT_TYPE_KEYS['release'][1]])) | ||||
origins = list(storage.origin_get( | origins = list(storage.origin_get( | ||||
[orig for orig in OBJECT_TYPE_KEYS['origin'][1]])) | [orig for orig in OBJECT_TYPE_KEYS['origin'][1]])) | ||||
assert OBJECT_TYPE_KEYS['origin'][1] == \ | assert OBJECT_TYPE_KEYS['origin'][1] == \ | ||||
[{'url': orig['url']} for orig in origins] | [{'url': orig['url']} for orig in origins] | ||||
for origin in origins: | for origin in origins: | ||||
origin_id_or_url = \ | origin_url = origin['url'] | ||||
origin['id'] if ENABLE_ORIGIN_IDS else origin['url'] | |||||
expected_visits = [ | expected_visits = [ | ||||
{ | { | ||||
**visit, | **visit, | ||||
'origin': origin_id_or_url, | 'origin': origin_url, | ||||
'date': dateutil.parser.parse(visit['date']), | 'date': dateutil.parser.parse(visit['date']), | ||||
} | } | ||||
for visit in OBJECT_TYPE_KEYS['origin_visit'][1] | for visit in OBJECT_TYPE_KEYS['origin_visit'][1] | ||||
if visit['origin']['url'] == origin['url'] | if visit['origin'] == origin['url'] | ||||
] | ] | ||||
actual_visits = list(storage.origin_visit_get( | actual_visits = list(storage.origin_visit_get( | ||||
origin_id_or_url)) | origin_url)) | ||||
for visit in actual_visits: | for visit in actual_visits: | ||||
del visit['visit'] # opaque identifier | del visit['visit'] # opaque identifier | ||||
assert expected_visits == actual_visits | assert expected_visits == actual_visits | ||||
contents = list(storage.content_get_metadata( | contents = list(storage.content_get_metadata( | ||||
[cont['sha1'] for cont in OBJECT_TYPE_KEYS['content'][1]])) | [cont['sha1'] for cont in OBJECT_TYPE_KEYS['content'][1]])) | ||||
assert None not in contents | assert None not in contents | ||||
assert contents == OBJECT_TYPE_KEYS['content'][1] | assert contents == OBJECT_TYPE_KEYS['content'][1] | ||||
Show All 34 Lines | def _test_write_replay_origin_visit(visits): | ||||
actual_visits = list(storage.origin_visit_get('http://example.com/')) | actual_visits = list(storage.origin_visit_get('http://example.com/')) | ||||
assert len(actual_visits) == len(visits), actual_visits | assert len(actual_visits) == len(visits), actual_visits | ||||
for vin, vout in zip(visits, actual_visits): | for vin, vout in zip(visits, actual_visits): | ||||
vin = vin.copy() | vin = vin.copy() | ||||
vout = vout.copy() | vout = vout.copy() | ||||
if ENABLE_ORIGIN_IDS: | |||||
assert vout.pop('origin') == 1 | |||||
else: | |||||
assert vout.pop('origin') == 'http://example.com/' | assert vout.pop('origin') == 'http://example.com/' | ||||
vin.pop('origin') | vin.pop('origin') | ||||
vin.setdefault('type', 'git') | vin.setdefault('type', 'git') | ||||
vin.setdefault('metadata', None) | vin.setdefault('metadata', None) | ||||
assert vin == vout | assert vin == vout | ||||
def test_write_replay_legacy_origin_visit1(): | def test_write_replay_origin_visit(): | ||||
"""Test origin_visit when the 'origin' is just a string.""" | """Test origin_visit when the 'origin' is just a string.""" | ||||
now = datetime.datetime.now() | now = datetime.datetime.now() | ||||
visits = [{ | visits = [{ | ||||
'visit': 1, | 'visit': 1, | ||||
'origin': 'http://example.com/', | 'origin': 'http://example.com/', | ||||
'date': now, | 'date': now, | ||||
'type': 'git', | 'type': 'git', | ||||
'status': 'partial', | 'status': 'partial', | ||||
'snapshot': None, | 'snapshot': None, | ||||
}] | }] | ||||
_test_write_replay_origin_visit(visits) | _test_write_replay_origin_visit(visits) | ||||
def test_write_replay_legacy_origin_visit1(): | |||||
"""Test origin_visit when there is no type.""" | |||||
now = datetime.datetime.now() | |||||
visits = [{ | |||||
'visit': 1, | |||||
'origin': 'http://example.com/', | |||||
'date': now, | |||||
'status': 'partial', | |||||
'snapshot': None, | |||||
}] | |||||
with pytest.raises(ValueError, match='too old'): | |||||
_test_write_replay_origin_visit(visits) | |||||
def test_write_replay_legacy_origin_visit2(): | def test_write_replay_legacy_origin_visit2(): | ||||
"""Test origin_visit when 'type' is missing.""" | """Test origin_visit when 'type' is missing from the visit, but not | ||||
from the origin.""" | |||||
now = datetime.datetime.now() | now = datetime.datetime.now() | ||||
visits = [{ | visits = [{ | ||||
'visit': 1, | 'visit': 1, | ||||
'origin': { | 'origin': { | ||||
'url': 'http://example.com/', | 'url': 'http://example.com/', | ||||
'type': 'git', | 'type': 'git', | ||||
}, | }, | ||||
'date': now, | 'date': now, | ||||
'type': 'git', | 'type': 'git', | ||||
'status': 'partial', | 'status': 'partial', | ||||
'snapshot': None, | 'snapshot': None, | ||||
}] | }] | ||||
_test_write_replay_origin_visit(visits) | _test_write_replay_origin_visit(visits) | ||||
def test_write_replay_legacy_origin_visit3(): | |||||
"""Test origin_visit when the origin is a dict""" | |||||
now = datetime.datetime.now() | |||||
visits = [{ | |||||
'visit': 1, | |||||
'origin': { | |||||
'url': 'http://example.com/', | |||||
}, | |||||
'date': now, | |||||
'type': 'git', | |||||
'status': 'partial', | |||||
'snapshot': None, | |||||
}] | |||||
_test_write_replay_origin_visit(visits) | |||||
hash_strategy = strategies.binary(min_size=20, max_size=20) | hash_strategy = strategies.binary(min_size=20, max_size=20) | ||||
@settings(max_examples=500) | @settings(max_examples=500) | ||||
@given(strategies.sets(hash_strategy, min_size=0, max_size=500), | @given(strategies.sets(hash_strategy, min_size=0, max_size=500), | ||||
strategies.sets(hash_strategy, min_size=10)) | strategies.sets(hash_strategy, min_size=10)) | ||||
def test_is_hash_in_bytearray(haystack, needles): | def test_is_hash_in_bytearray(haystack, needles): | ||||
array = b''.join(sorted(haystack)) | array = b''.join(sorted(haystack)) | ||||
needles |= haystack # Exhaustively test for all objects in the array | needles |= haystack # Exhaustively test for all objects in the array | ||||
for needle in needles: | for needle in needles: | ||||
assert is_hash_in_bytearray(needle, array, len(haystack)) == \ | assert is_hash_in_bytearray(needle, array, len(haystack)) == \ | ||||
(needle in haystack) | (needle in haystack) |