Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/tests/test_loader.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import hashlib | import hashlib | ||||
import logging | |||||
import string | import string | ||||
from unittest.mock import Mock, call, patch | from unittest.mock import Mock, call, patch | ||||
import attr | import attr | ||||
import pytest | import pytest | ||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BasePackageInfo, PackageLoader | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
ExtID, | |||||
Origin, | Origin, | ||||
OriginVisit, | OriginVisit, | ||||
OriginVisitStatus, | OriginVisitStatus, | ||||
Person, | |||||
Release, | |||||
Revision, | |||||
RevisionType, | |||||
Snapshot, | Snapshot, | ||||
SnapshotBranch, | SnapshotBranch, | ||||
TargetType, | TargetType, | ||||
TimestampWithTimezone, | |||||
) | ) | ||||
from swh.model.model import ExtID | |||||
from swh.model.model import ObjectType as ModelObjectType | |||||
from swh.model.swhids import CoreSWHID, ObjectType | from swh.model.swhids import CoreSWHID, ObjectType | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.algos.snapshot import snapshot_get_latest | from swh.storage.algos.snapshot import snapshot_get_latest | ||||
class FakeStorage: | class FakeStorage: | ||||
def origin_add(self, origins): | def origin_add(self, origins): | ||||
raise ValueError("We refuse to add an origin") | raise ValueError("We refuse to add an origin") | ||||
Show All 26 Lines | def get_package_info(self, version): | ||||
patch.object( | patch.object( | ||||
p_info, | p_info, | ||||
"extid", | "extid", | ||||
return_value=(extid_type, f"extid-of-{version}".encode()), | return_value=(extid_type, f"extid-of-{version}".encode()), | ||||
autospec=True, | autospec=True, | ||||
).start() | ).start() | ||||
yield (f"branch-{version}", p_info) | yield (f"branch-{version}", p_info) | ||||
def _load_revision(self, p_info, origin): | def _load_release(self, version, p_info, origin): | ||||
return None | return None | ||||
def test_loader_origin_visit_failure(swh_storage): | def test_loader_origin_visit_failure(swh_storage): | ||||
"""Failure to add origin or origin visit should fail immediately | """Failure to add origin or origin visit should fail immediately | ||||
""" | """ | ||||
loader = PackageLoader(swh_storage, "some-url") | loader = PackageLoader(swh_storage, "some-url") | ||||
loader.storage = FakeStorage() | loader.storage = FakeStorage() | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status == {"status": "failed"} | assert actual_load_status == {"status": "failed"} | ||||
loader.storage = FakeStorage2() | loader.storage = FakeStorage2() | ||||
actual_load_status2 = loader.load() | actual_load_status2 = loader.load() | ||||
assert actual_load_status2 == {"status": "failed"} | assert actual_load_status2 == {"status": "failed"} | ||||
def test_resolve_revision_from_extids() -> None: | def test_resolve_object_from_extids() -> None: | ||||
loader = PackageLoader(None, None) # type: ignore | loader = PackageLoader(None, None) # type: ignore | ||||
p_info = Mock(wraps=BasePackageInfo(None, None)) # type: ignore | p_info = Mock(wraps=BasePackageInfo(None, None)) # type: ignore | ||||
# The PackageInfo does not support extids | # The PackageInfo does not support extids | ||||
p_info.extid.return_value = None | p_info.extid.return_value = None | ||||
known_extids = { | known_extids = { | ||||
("extid-type", b"extid-of-aaaa"): [ | ("extid-type", b"extid-of-aaaa"): [ | ||||
CoreSWHID(object_type=ObjectType.REVISION, object_id=b"a" * 20), | CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"a" * 20), | ||||
] | ] | ||||
} | } | ||||
revision_whitelist = {b"unused"} | whitelist = {b"unused"} | ||||
assert ( | assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None | ||||
loader.resolve_revision_from_extids(known_extids, p_info, revision_whitelist) | |||||
is None | |||||
) | |||||
# Some known extid, and the PackageInfo is not one of them (i.e. cache miss) | # Some known extid, and the PackageInfo is not one of them (i.e. cache miss) | ||||
p_info.extid.return_value = ("extid-type", b"extid-of-cccc") | p_info.extid.return_value = ("extid-type", b"extid-of-cccc") | ||||
assert ( | assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None | ||||
loader.resolve_revision_from_extids(known_extids, p_info, revision_whitelist) | |||||
is None | |||||
) | |||||
# Some known extid, and the PackageInfo is one of them (i.e. cache hit), | # Some known extid, and the PackageInfo is one of them (i.e. cache hit), | ||||
# but the target revision was not in the previous snapshot | # but the target release was not in the previous snapshot | ||||
p_info.extid.return_value = ("extid-type", b"extid-of-aaaa") | p_info.extid.return_value = ("extid-type", b"extid-of-aaaa") | ||||
assert ( | assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None | ||||
loader.resolve_revision_from_extids(known_extids, p_info, revision_whitelist) | |||||
is None | |||||
) | |||||
# Some known extid, and the PackageInfo is one of them (i.e. cache hit), | # Some known extid, and the PackageInfo is one of them (i.e. cache hit), | ||||
# and the target revision was in the previous snapshot | # and the target release was in the previous snapshot | ||||
revision_whitelist = {b"a" * 20} | whitelist = {b"a" * 20} | ||||
assert ( | assert loader.resolve_object_from_extids( | ||||
loader.resolve_revision_from_extids(known_extids, p_info, revision_whitelist) | known_extids, p_info, whitelist | ||||
== b"a" * 20 | ) == CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"a" * 20) | ||||
) | |||||
# Same as before, but there is more than one extid, and only one is an allowed | # Same as before, but there is more than one extid, and only one is an allowed | ||||
# revision | # release | ||||
revision_whitelist = {b"a" * 20} | whitelist = {b"a" * 20} | ||||
known_extids = { | known_extids = { | ||||
("extid-type", b"extid-of-aaaa"): [ | ("extid-type", b"extid-of-aaaa"): [ | ||||
CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20), | CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"b" * 20), | ||||
CoreSWHID(object_type=ObjectType.REVISION, object_id=b"a" * 20), | CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"a" * 20), | ||||
] | ] | ||||
} | } | ||||
assert ( | assert loader.resolve_object_from_extids( | ||||
loader.resolve_revision_from_extids(known_extids, p_info, revision_whitelist) | known_extids, p_info, whitelist | ||||
== b"a" * 20 | ) == CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"a" * 20) | ||||
) | |||||
def test_load_get_known_extids() -> None: | def test_load_get_known_extids() -> None: | ||||
"""Checks PackageLoader.load() fetches known extids efficiently""" | """Checks PackageLoader.load() fetches known extids efficiently""" | ||||
storage = Mock(wraps=get_storage("memory")) | storage = Mock(wraps=get_storage("memory")) | ||||
loader = StubPackageLoader(storage, "http://example.org") | loader = StubPackageLoader(storage, "http://example.org") | ||||
Show All 10 Lines | |||||
def test_load_extids() -> None: | def test_load_extids() -> None: | ||||
"""Checks PackageLoader.load() skips iff it should, and writes (only) | """Checks PackageLoader.load() skips iff it should, and writes (only) | ||||
the new ExtIDs""" | the new ExtIDs""" | ||||
storage = get_storage("memory") | storage = get_storage("memory") | ||||
origin = "http://example.org" | origin = "http://example.org" | ||||
rev1_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"a" * 20) | rel1_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"a" * 20) | ||||
rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20) | rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"b" * 20) | ||||
rev3_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"c" * 20) | rel3_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20) | ||||
rev4_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"d" * 20) | rel4_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"d" * 20) | ||||
dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) | dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) | ||||
# Results of a previous load | # Results of a previous load | ||||
storage.extid_add( | storage.extid_add( | ||||
[ | [ | ||||
ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid), | ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), | ||||
ExtID("extid-type2", b"extid-of-v2.0", rev2_swhid), | ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), | ||||
] | ] | ||||
) | ) | ||||
last_snapshot = Snapshot( | last_snapshot = Snapshot( | ||||
branches={ | branches={ | ||||
b"v1.0": SnapshotBranch( | b"v1.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev1_swhid.object_id | target_type=TargetType.RELEASE, target=rel1_swhid.object_id | ||||
), | ), | ||||
b"v2.0": SnapshotBranch( | b"v2.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev2_swhid.object_id | target_type=TargetType.RELEASE, target=rel2_swhid.object_id | ||||
), | ), | ||||
b"v3.0": SnapshotBranch( | b"v3.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev3_swhid.object_id | target_type=TargetType.RELEASE, target=rel3_swhid.object_id | ||||
), | ), | ||||
} | } | ||||
) | ) | ||||
storage.snapshot_add([last_snapshot]) | storage.snapshot_add([last_snapshot]) | ||||
date = datetime.datetime.now(tz=datetime.timezone.utc) | date = datetime.datetime.now(tz=datetime.timezone.utc) | ||||
storage.origin_add([Origin(url=origin)]) | storage.origin_add([Origin(url=origin)]) | ||||
storage.origin_visit_add( | storage.origin_visit_add( | ||||
[OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] | [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] | ||||
) | ) | ||||
storage.origin_visit_status_add( | storage.origin_visit_status_add( | ||||
[ | [ | ||||
OriginVisitStatus( | OriginVisitStatus( | ||||
origin=origin, | origin=origin, | ||||
visit=1, | visit=1, | ||||
status="full", | status="full", | ||||
date=date, | date=date, | ||||
snapshot=last_snapshot.id, | snapshot=last_snapshot.id, | ||||
) | ) | ||||
] | ] | ||||
) | ) | ||||
loader = StubPackageLoader(storage, "http://example.org") | loader = StubPackageLoader(storage, "http://example.org") | ||||
patch.object( | patch.object( | ||||
loader, | loader, | ||||
"_load_revision", | "_load_release", | ||||
return_value=(rev4_swhid.object_id, dir_swhid.object_id), | return_value=(rel4_swhid.object_id, dir_swhid.object_id), | ||||
autospec=True, | autospec=True, | ||||
).start() | ).start() | ||||
loader.load() | loader.load() | ||||
assert loader._load_revision.mock_calls == [ # type: ignore | assert loader._load_release.mock_calls == [ # type: ignore | ||||
# v1.0: not loaded because there is already its (extid_type, extid, rev) | # v1.0: not loaded because there is already its (extid_type, extid, rel) | ||||
# in the storage. | # in the storage. | ||||
# v2.0: loaded, because the stored extid has the same value but a different type | # v2.0: loaded, because the stored extid has the same value but a different type | ||||
call(StubPackageInfo(origin, "example-v2.0.tar"), Origin(url=origin)), | call("v2.0", StubPackageInfo(origin, "example-v2.0.tar"), Origin(url=origin)), | ||||
# v3.0: loaded despite having an (extid_type, extid) in storage, because | # v3.0: loaded despite having an (extid_type, extid) in storage, because | ||||
# the target of the extid is not in the previous snapshot | # the target of the extid is not in the previous snapshot | ||||
call(StubPackageInfo(origin, "example-v3.0.tar"), Origin(url=origin)), | call("v3.0", StubPackageInfo(origin, "example-v3.0.tar"), Origin(url=origin)), | ||||
# v4.0: loaded, because there isn't its extid | # v4.0: loaded, because there isn't its extid | ||||
call(StubPackageInfo(origin, "example-v4.0.tar"), Origin(url=origin)), | call("v4.0", StubPackageInfo(origin, "example-v4.0.tar"), Origin(url=origin)), | ||||
] | ] | ||||
# then check the snapshot has all the branches. | # then check the snapshot has all the branches. | ||||
# versions 2.0 to 4.0 all point to rev4_swhid (instead of the value of the last | # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last | ||||
# snapshot), because they had to be loaded (mismatched extid), and the mocked | # snapshot), because they had to be loaded (mismatched extid), and the mocked | ||||
# _load_revision always returns rev4_swhid. | # _load_release always returns rel4_swhid. | ||||
snapshot = Snapshot( | snapshot = Snapshot( | ||||
branches={ | branches={ | ||||
b"branch-v1.0": SnapshotBranch( | b"branch-v1.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev1_swhid.object_id | target_type=TargetType.RELEASE, target=rel1_swhid.object_id | ||||
), | ), | ||||
b"branch-v2.0": SnapshotBranch( | b"branch-v2.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev4_swhid.object_id | target_type=TargetType.RELEASE, target=rel4_swhid.object_id | ||||
), | ), | ||||
b"branch-v3.0": SnapshotBranch( | b"branch-v3.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev4_swhid.object_id | target_type=TargetType.RELEASE, target=rel4_swhid.object_id | ||||
), | ), | ||||
b"branch-v4.0": SnapshotBranch( | b"branch-v4.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, target=rev4_swhid.object_id | target_type=TargetType.RELEASE, target=rel4_swhid.object_id | ||||
), | ), | ||||
} | } | ||||
) | ) | ||||
assert snapshot_get_latest(storage, origin) == snapshot | assert snapshot_get_latest(storage, origin) == snapshot | ||||
extids = storage.extid_get_from_target( | extids = storage.extid_get_from_target( | ||||
ObjectType.REVISION, | ObjectType.RELEASE, | ||||
[ | [ | ||||
rev1_swhid.object_id, | rel1_swhid.object_id, | ||||
rev2_swhid.object_id, | rel2_swhid.object_id, | ||||
rev3_swhid.object_id, | rel3_swhid.object_id, | ||||
rev4_swhid.object_id, | rel4_swhid.object_id, | ||||
], | ], | ||||
) | ) | ||||
assert set(extids) == { | assert set(extids) == { | ||||
# What we inserted at the beginning of the test: | # What we inserted at the beginning of the test: | ||||
ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid), | ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), | ||||
ExtID("extid-type2", b"extid-of-v2.0", rev2_swhid), | ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), | ||||
# Added by the loader: | # Added by the loader: | ||||
ExtID("extid-type1", b"extid-of-v2.0", rev4_swhid), | ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid), | ||||
ExtID("extid-type2", b"extid-of-v3.0", rev4_swhid), | ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid), | ||||
ExtID("extid-type2", b"extid-of-v4.0", rev4_swhid), | ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid), | ||||
} | |||||
def test_load_upgrade_from_revision_extids(caplog): | |||||
"""Tests that, when loading incrementally based on a snapshot made by an old | |||||
version of the loader, the loader will convert revisions to releases | |||||
and add them to the storage. | |||||
Also checks that, if an extid exists pointing to a non-existent revision | |||||
(which should never happen, but you never know...), the release is loaded from | |||||
scratch.""" | |||||
storage = get_storage("memory") | |||||
origin = "http://example.org" | |||||
dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20) | |||||
dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) | |||||
date = TimestampWithTimezone.from_datetime( | |||||
datetime.datetime.now(tz=datetime.timezone.utc) | |||||
) | |||||
person = Person.from_fullname(b"Jane Doe <jdoe@example.org>") | |||||
rev1 = Revision( | |||||
message=b"blah", | |||||
author=person, | |||||
date=date, | |||||
committer=person, | |||||
committer_date=date, | |||||
directory=dir1_swhid.object_id, | |||||
type=RevisionType.TAR, | |||||
synthetic=True, | |||||
) | |||||
rel1 = Release( | |||||
name=b"v1.0", | |||||
message=b"blah", | |||||
author=person, | |||||
date=date, | |||||
target=dir1_swhid.object_id, | |||||
target_type=ModelObjectType.DIRECTORY, | |||||
synthetic=True, | |||||
) | |||||
rev1_swhid = rev1.swhid() | |||||
rel1_swhid = rel1.swhid() | |||||
rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20) | |||||
rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20) | |||||
# Results of a previous load | |||||
storage.extid_add( | |||||
[ | |||||
ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid), | |||||
ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid), | |||||
] | |||||
) | |||||
storage.revision_add([rev1]) | |||||
last_snapshot = Snapshot( | |||||
branches={ | |||||
b"v1.0": SnapshotBranch( | |||||
target_type=TargetType.REVISION, target=rev1_swhid.object_id | |||||
), | |||||
b"v2.0": SnapshotBranch( | |||||
target_type=TargetType.REVISION, target=rev2_swhid.object_id | |||||
), | |||||
} | |||||
) | |||||
storage.snapshot_add([last_snapshot]) | |||||
date = datetime.datetime.now(tz=datetime.timezone.utc) | |||||
storage.origin_add([Origin(url=origin)]) | |||||
storage.origin_visit_add( | |||||
[OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] | |||||
) | |||||
storage.origin_visit_status_add( | |||||
[ | |||||
OriginVisitStatus( | |||||
origin=origin, | |||||
visit=1, | |||||
status="full", | |||||
date=date, | |||||
snapshot=last_snapshot.id, | |||||
) | |||||
] | |||||
) | |||||
loader = StubPackageLoader(storage, "http://example.org") | |||||
patch.object( | |||||
loader, | |||||
"_load_release", | |||||
return_value=(rel2_swhid.object_id, dir2_swhid.object_id), | |||||
autospec=True, | |||||
).start() | |||||
patch.object( | |||||
loader, "get_versions", return_value=["v1.0", "v2.0", "v3.0"], autospec=True, | |||||
).start() | |||||
caplog.set_level(logging.ERROR) | |||||
loader.load() | |||||
assert len(caplog.records) == 1 | |||||
(record,) = caplog.records | |||||
assert record.levelname == "ERROR" | |||||
assert "Failed to upgrade branch branch-v2.0" in record.message | |||||
assert loader._load_release.mock_calls == [ | |||||
# v1.0: not loaded because there is already a revision matching it | |||||
# v2.0: loaded, as the revision is missing from the storage even though there | |||||
# is an extid | |||||
call("v2.0", StubPackageInfo(origin, "example-v2.0.tar"), Origin(url=origin)), | |||||
# v3.0: loaded (did not exist yet) | |||||
call("v3.0", StubPackageInfo(origin, "example-v3.0.tar"), Origin(url=origin)), | |||||
] | |||||
snapshot = Snapshot( | |||||
branches={ | |||||
b"branch-v1.0": SnapshotBranch( | |||||
target_type=TargetType.RELEASE, target=rel1_swhid.object_id | |||||
), | |||||
b"branch-v2.0": SnapshotBranch( | |||||
target_type=TargetType.RELEASE, target=rel2_swhid.object_id | |||||
), | |||||
b"branch-v3.0": SnapshotBranch( | |||||
target_type=TargetType.RELEASE, target=rel2_swhid.object_id | |||||
), | |||||
} | |||||
) | |||||
assert snapshot_get_latest(storage, origin) == snapshot | |||||
extids = storage.extid_get_from_target( | |||||
ObjectType.RELEASE, [rel1_swhid.object_id, rel2_swhid.object_id,], | |||||
) | |||||
assert set(extids) == { | |||||
ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), | |||||
ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid), | |||||
ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid), | |||||
} | } | ||||
def test_manifest_extid(): | def test_manifest_extid(): | ||||
"""Compute primary key should return the right identity | """Compute primary key should return the right identity | ||||
""" | """ | ||||
Show All 39 Lines |