diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ swh.core[db,http] >= 0.0.60 swh.model >= 0.0.40 -swh.storage >= 0.0.147 +swh.storage >= 0.0.156 diff --git a/swh/journal/replay.py b/swh/journal/replay.py --- a/swh/journal/replay.py +++ b/swh/journal/replay.py @@ -89,16 +89,18 @@ good_visits = [] for visit in visits: visit = visit.copy() - if isinstance(visit['origin'], str): - # note that it will crash with the pg and - # in-mem storages if the origin is not already known, - # but there is no other choice because we can't add an - # origin without knowing its type. Non-pg storages - # don't use a numeric FK internally, - visit['origin'] = {'url': visit['origin']} - else: - if 'type' not in visit: + if 'type' not in visit: + if isinstance(visit['origin'], dict) and 'type' in visit['origin']: + # Very old version of the schema: visits did not have a type, + # but their 'origin' field was a dict with a 'type' key. visit['type'] = visit['origin']['type'] + else: + # Very very old version of the schema: 'type' is missing, + # so there is nothing we can do to fix it. + raise ValueError('Got an origin_visit too old to be replayed.') + if isinstance(visit['origin'], dict): + # Old version of the schema: visit['origin'] was a dict. + visit['origin'] = visit['origin']['url'] good_visits.append(visit) return good_visits @@ -183,17 +185,20 @@ [] - `visit['origin']` is an URL instead of a dict: + `visit['origin']` is a dict instead of an URL: - >>> fix_objects('origin_visit', [{'origin': 'http://foo'}]) - [{'origin': {'url': 'http://foo'}}] + >>> pprint(fix_objects('origin_visit', [{ + ... 'origin': {'url': 'http://foo'}, + ... 'type': 'git', + ... }])) + [{'origin': 'http://foo', 'type': 'git'}] `visit['type']` is missing , but `origin['visit']['type']` exists: - >>> pprint(fix_objects( - ... 'origin_visit', - ... [{'origin': {'type': 'hg', 'url': 'http://foo'}}])) - [{'origin': {'type': 'hg', 'url': 'http://foo'}, 'type': 'hg'}] + >>> pprint(fix_objects('origin_visit', [ + ... {'origin': {'type': 'hg', 'url': 'http://foo'} + ... }])) + [{'origin': 'http://foo', 'type': 'hg'}] """ # noqa if object_type == 'revision': @@ -220,7 +225,7 @@ method(objects) elif object_type == 'origin_visit': for visit in objects: - storage.origin_add_one(visit['origin']) + storage.origin_add_one({'url': visit['origin']}) if 'metadata' not in visit: visit['metadata'] = None storage.origin_visit_upsert(objects) diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py --- a/swh/journal/tests/conftest.py +++ b/swh/journal/tests/conftest.py @@ -128,7 +128,7 @@ ORIGIN_VISITS = [ { - 'origin': ORIGINS[0], + 'origin': ORIGINS[0]['url'], 'date': '2013-05-07 04:20:39.369271+00:00', 'snapshot': None, # TODO 'status': 'ongoing', # TODO @@ -136,7 +136,7 @@ 'type': 'git', }, { - 'origin': ORIGINS[0], + 'origin': ORIGINS[0]['url'], 'date': '2018-11-27 17:20:39+00:00', 'snapshot': None, # TODO 'status': 'ongoing', # TODO diff --git a/swh/journal/tests/test_kafka_writer.py b/swh/journal/tests/test_kafka_writer.py --- a/swh/journal/tests/test_kafka_writer.py +++ b/swh/journal/tests/test_kafka_writer.py @@ -123,12 +123,13 @@ elif object_type in ('origin_visit',): for object_ in objects: object_ = object_.copy() - origin_id = storage.origin_add_one(object_.pop('origin')) - visit = method(origin=origin_id, date=object_.pop('date'), + origin_url = object_.pop('origin') + storage.origin_add_one({'url': origin_url}) + visit = method(origin=origin_url, date=object_.pop('date'), type=object_.pop('type')) expected_messages += 1 visit_id = visit['visit'] - storage.origin_visit_update(origin_id, visit_id, **object_) + storage.origin_visit_update(origin_url, visit_id, **object_) expected_messages += 1 else: assert False, object_type diff --git a/swh/journal/tests/test_replay.py b/swh/journal/tests/test_replay.py --- a/swh/journal/tests/test_replay.py +++ b/swh/journal/tests/test_replay.py @@ -12,9 +12,9 @@ import dateutil from confluent_kafka import Producer from hypothesis import strategies, given, settings +import pytest from swh.storage import get_storage -from swh.storage.in_memory import ENABLE_ORIGIN_IDS from swh.journal.client import JournalClient from swh.journal.serializers import key_to_kafka, value_to_kafka @@ -88,19 +88,18 @@ assert OBJECT_TYPE_KEYS['origin'][1] == \ [{'url': orig['url']} for orig in origins] for origin in origins: - origin_id_or_url = \ - origin['id'] if ENABLE_ORIGIN_IDS else origin['url'] + origin_url = origin['url'] expected_visits = [ { **visit, - 'origin': origin_id_or_url, + 'origin': origin_url, 'date': dateutil.parser.parse(visit['date']), } for visit in OBJECT_TYPE_KEYS['origin_visit'][1] - if visit['origin']['url'] == origin['url'] + if visit['origin'] == origin['url'] ] actual_visits = list(storage.origin_visit_get( - origin_id_or_url)) + origin_url)) for visit in actual_visits: del visit['visit'] # opaque identifier assert expected_visits == actual_visits @@ -151,17 +150,14 @@ for vin, vout in zip(visits, actual_visits): vin = vin.copy() vout = vout.copy() - if ENABLE_ORIGIN_IDS: - assert vout.pop('origin') == 1 - else: - assert vout.pop('origin') == 'http://example.com/' + assert vout.pop('origin') == 'http://example.com/' vin.pop('origin') vin.setdefault('type', 'git') vin.setdefault('metadata', None) assert vin == vout -def test_write_replay_legacy_origin_visit1(): +def test_write_replay_origin_visit(): """Test origin_visit when the 'origin' is just a string.""" now = datetime.datetime.now() visits = [{ @@ -175,8 +171,23 @@ _test_write_replay_origin_visit(visits) +def test_write_replay_legacy_origin_visit1(): + """Test origin_visit when there is no type.""" + now = datetime.datetime.now() + visits = [{ + 'visit': 1, + 'origin': 'http://example.com/', + 'date': now, + 'status': 'partial', + 'snapshot': None, + }] + with pytest.raises(ValueError, match='too old'): + _test_write_replay_origin_visit(visits) + + def test_write_replay_legacy_origin_visit2(): - """Test origin_visit when 'type' is missing.""" + """Test origin_visit when 'type' is missing from the visit, but not + from the origin.""" now = datetime.datetime.now() visits = [{ 'visit': 1, @@ -192,6 +203,22 @@ _test_write_replay_origin_visit(visits) +def test_write_replay_legacy_origin_visit3(): + """Test origin_visit when the origin is a dict""" + now = datetime.datetime.now() + visits = [{ + 'visit': 1, + 'origin': { + 'url': 'http://example.com/', + }, + 'date': now, + 'type': 'git', + 'status': 'partial', + 'snapshot': None, + }] + _test_write_replay_origin_visit(visits) + + hash_strategy = strategies.binary(min_size=20, max_size=20) diff --git a/swh/journal/tests/test_write_replay.py b/swh/journal/tests/test_write_replay.py --- a/swh/journal/tests/test_write_replay.py +++ b/swh/journal/tests/test_write_replay.py @@ -30,7 +30,7 @@ for (obj_type, obj) in objects: obj = obj.copy() if obj_type == 'origin_visit': - storage1.origin_add_one(obj['origin']) + storage1.origin_add_one({'url': obj['origin']}) storage1.origin_visit_upsert([obj]) else: method = getattr(storage1, obj_type + '_add') diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist=flake8,mypy,py3-no-origin-ids,py3 +envlist=flake8,mypy,py3 [testenv:py3] passenv=SWH_KAFKA_ROOT @@ -14,19 +14,6 @@ --cov-branch \ --doctest-modules {posargs} -[testenv:py3-no-origin-ids] -passenv=SWH_KAFKA_ROOT -deps = - .[testing] - pytest-cov -setenv = - SWH_STORAGE_IN_MEMORY_ENABLE_ORIGIN_IDS=false - SWH_KAFKA_ROOT = {env:SWH_KAFKA_ROOT:swh/journal/tests/kafka} -commands = - pytest --cov={envsitepackagesdir}/swh/journal \ - {envsitepackagesdir}/swh/journal \ - --cov-branch --doctest-modules {posargs} - [testenv:flake8] skip_install = true deps =