Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/archive/tests/test_archive.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import attr | import attr | ||||
from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo | from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo | ||||
from swh.loader.package.tests.common import check_metadata_paths | from swh.loader.package.tests.common import check_metadata_paths | ||||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | |||||
_expected_new_revisions_first_visit = { | _expected_new_revisions_first_visit = { | ||||
"44183488c0774ce3c957fa19ba695cf18a4a42b3": ( | "44183488c0774ce3c957fa19ba695cf18a4a42b3": ( | ||||
"3aebc29ed1fccc4a6f2f2010fb8e57882406b528" | "3aebc29ed1fccc4a6f2f2010fb8e57882406b528" | ||||
) | ) | ||||
} | } | ||||
def visit_with_no_artifact_found(swh_config, requests_mock_datadir): | def test_archive_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): | ||||
url = URL | url = URL | ||||
unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" | unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" | ||||
loader = ArchiveLoader( | loader = ArchiveLoader( | ||||
swh_storage, | |||||
url, | url, | ||||
artifacts=[ | artifacts=[ | ||||
{ | { | ||||
"time": 944729610, | "time": 944729610, | ||||
"url": unknown_artifact_url, # unknown artifact | "url": unknown_artifact_url, # unknown artifact | ||||
"length": 221837, | "length": 221837, | ||||
"filename": "8sync-0.1.0.tar.gz", | "filename": "8sync-0.1.0.tar.gz", | ||||
"version": "0.1.0", | "version": "0.1.0", | ||||
} | } | ||||
], | ], | ||||
) | ) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "uneventful" | assert actual_load_status["status"] == "uneventful" | ||||
assert actual_load_status["snapshot_id"] is not None | assert actual_load_status["snapshot_id"] is not None | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert { | assert { | ||||
"content": 0, | "content": 0, | ||||
"directory": 0, | "directory": 0, | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": 0, | "revision": 0, | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 1, | "snapshot": 1, | ||||
} == stats | } == stats | ||||
assert_last_visit_matches(loader.storage, url, status="partial", type="tar") | assert_last_visit_matches(swh_storage, url, status="partial", type="tar") | ||||
def test_check_revision_metadata_structure(swh_config, requests_mock_datadir): | def test_archive_check_revision_metadata_structure(swh_storage, requests_mock_datadir): | ||||
loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS) | loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "eventful" | assert actual_load_status["status"] == "eventful" | ||||
assert actual_load_status["snapshot_id"] is not None | assert actual_load_status["snapshot_id"] is not None | ||||
assert_last_visit_matches(loader.storage, URL, status="full", type="tar") | assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | ||||
expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3") | expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3") | ||||
revision = loader.storage.revision_get([expected_revision_id])[0] | revision = swh_storage.revision_get([expected_revision_id])[0] | ||||
assert revision is not None | assert revision is not None | ||||
check_metadata_paths( | check_metadata_paths( | ||||
revision.metadata, | revision.metadata, | ||||
paths=[ | paths=[ | ||||
("intrinsic", dict), | ("intrinsic", dict), | ||||
("extrinsic.provider", str), | ("extrinsic.provider", str), | ||||
("extrinsic.when", str), | ("extrinsic.when", str), | ||||
("extrinsic.raw", dict), | ("extrinsic.raw", dict), | ||||
("original_artifact", list), | ("original_artifact", list), | ||||
], | ], | ||||
) | ) | ||||
for original_artifact in revision.metadata["original_artifact"]: | for original_artifact in revision.metadata["original_artifact"]: | ||||
check_metadata_paths( | check_metadata_paths( | ||||
original_artifact, | original_artifact, | ||||
paths=[("filename", str), ("length", int), ("checksums", dict),], | paths=[("filename", str), ("length", int), ("checksums", dict),], | ||||
) | ) | ||||
def test_visit_with_release_artifact_no_prior_visit(swh_config, requests_mock_datadir): | def test_archive_visit_with_release_artifact_no_prior_visit( | ||||
swh_storage, requests_mock_datadir | |||||
): | |||||
"""With no prior visit, load a gnu project ends up with 1 snapshot | """With no prior visit, load a gnu project ends up with 1 snapshot | ||||
""" | """ | ||||
loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS) | loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "eventful" | assert actual_load_status["status"] == "eventful" | ||||
expected_snapshot_first_visit_id = hash_to_bytes( | expected_snapshot_first_visit_id = hash_to_bytes( | ||||
"c419397fd912039825ebdbea378bc6283f006bf5" | "c419397fd912039825ebdbea378bc6283f006bf5" | ||||
) | ) | ||||
assert ( | assert ( | ||||
hash_to_bytes(actual_load_status["snapshot_id"]) | hash_to_bytes(actual_load_status["snapshot_id"]) | ||||
== expected_snapshot_first_visit_id | == expected_snapshot_first_visit_id | ||||
) | ) | ||||
assert_last_visit_matches(loader.storage, URL, status="full", type="tar") | assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert { | assert { | ||||
"content": len(_expected_new_contents_first_visit), | "content": len(_expected_new_contents_first_visit), | ||||
"directory": len(_expected_new_directories_first_visit), | "directory": len(_expected_new_directories_first_visit), | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": len(_expected_new_revisions_first_visit), | "revision": len(_expected_new_revisions_first_visit), | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 1, | "snapshot": 1, | ||||
} == stats | } == stats | ||||
expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) | expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) | ||||
assert list(loader.storage.content_missing_per_sha1(expected_contents)) == [] | assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] | ||||
expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) | expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) | ||||
assert list(loader.storage.directory_missing(expected_dirs)) == [] | assert list(swh_storage.directory_missing(expected_dirs)) == [] | ||||
expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit) | expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit) | ||||
assert list(loader.storage.revision_missing(expected_revs)) == [] | assert list(swh_storage.revision_missing(expected_revs)) == [] | ||||
expected_snapshot = Snapshot( | expected_snapshot = Snapshot( | ||||
id=expected_snapshot_first_visit_id, | id=expected_snapshot_first_visit_id, | ||||
branches={ | branches={ | ||||
b"HEAD": SnapshotBranch( | b"HEAD": SnapshotBranch( | ||||
target_type=TargetType.ALIAS, target=b"releases/0.1.0", | target_type=TargetType.ALIAS, target=b"releases/0.1.0", | ||||
), | ), | ||||
b"releases/0.1.0": SnapshotBranch( | b"releases/0.1.0": SnapshotBranch( | ||||
target_type=TargetType.REVISION, | target_type=TargetType.REVISION, | ||||
target=hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3"), | target=hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3"), | ||||
), | ), | ||||
}, | }, | ||||
) | ) | ||||
check_snapshot(expected_snapshot, loader.storage) | check_snapshot(expected_snapshot, swh_storage) | ||||
def test_2_visits_without_change(swh_config, requests_mock_datadir): | def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir): | ||||
"""With no prior visit, load a gnu project ends up with 1 snapshot | """With no prior visit, load a gnu project ends up with 1 snapshot | ||||
""" | """ | ||||
url = URL | url = URL | ||||
loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS) | loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "eventful" | assert actual_load_status["status"] == "eventful" | ||||
assert actual_load_status["snapshot_id"] is not None | assert actual_load_status["snapshot_id"] is not None | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
actual_load_status2 = loader.load() | actual_load_status2 = loader.load() | ||||
assert actual_load_status2["status"] == "uneventful" | assert actual_load_status2["status"] == "uneventful" | ||||
assert actual_load_status2["snapshot_id"] is not None | assert actual_load_status2["snapshot_id"] is not None | ||||
assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] | assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
urls = [ | urls = [ | ||||
m.url | m.url | ||||
for m in requests_mock_datadir.request_history | for m in requests_mock_datadir.request_history | ||||
if m.url.startswith("https://ftp.gnu.org") | if m.url.startswith("https://ftp.gnu.org") | ||||
] | ] | ||||
assert len(urls) == 1 | assert len(urls) == 1 | ||||
def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir): | def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): | ||||
"""With no prior visit, load a gnu project ends up with 1 snapshot | """With no prior visit, load a gnu project ends up with 1 snapshot | ||||
""" | """ | ||||
url = URL | url = URL | ||||
artifact1 = GNU_ARTIFACTS[0] | artifact1 = GNU_ARTIFACTS[0] | ||||
loader = ArchiveLoader(url, [artifact1]) | loader = ArchiveLoader(swh_storage, url, [artifact1]) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "eventful" | assert actual_load_status["status"] == "eventful" | ||||
assert actual_load_status["snapshot_id"] is not None | assert actual_load_status["snapshot_id"] is not None | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert { | assert { | ||||
"content": len(_expected_new_contents_first_visit), | "content": len(_expected_new_contents_first_visit), | ||||
"directory": len(_expected_new_directories_first_visit), | "directory": len(_expected_new_directories_first_visit), | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": len(_expected_new_revisions_first_visit), | "revision": len(_expected_new_revisions_first_visit), | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
Show All 10 Lines | def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): | ||||
artifact2 = { | artifact2 = { | ||||
"time": 1480991830, | "time": 1480991830, | ||||
"url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", | "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", | ||||
"length": 238466, | "length": 238466, | ||||
"filename": "8sync-0.2.0.tar.gz", | "filename": "8sync-0.2.0.tar.gz", | ||||
"version": "0.2.0", | "version": "0.2.0", | ||||
} | } | ||||
loader2 = ArchiveLoader(url, [artifact1, artifact2]) | loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) | ||||
# implementation detail: share the storage in between visits | stats2 = get_stats(swh_storage) | ||||
loader2.storage = loader.storage | |||||
stats2 = get_stats(loader2.storage) | |||||
assert stats == stats2 # ensure we share the storage | assert stats == stats2 # ensure we share the storage | ||||
actual_load_status2 = loader2.load() | actual_load_status2 = loader2.load() | ||||
assert actual_load_status2["status"] == "eventful" | assert actual_load_status2["status"] == "eventful" | ||||
assert actual_load_status2["snapshot_id"] is not None | assert actual_load_status2["snapshot_id"] is not None | ||||
stats2 = get_stats(loader.storage) | stats2 = get_stats(swh_storage) | ||||
assert { | assert { | ||||
"content": len(_expected_new_contents_first_visit) + 14, | "content": len(_expected_new_contents_first_visit) + 14, | ||||
"directory": len(_expected_new_directories_first_visit) + 8, | "directory": len(_expected_new_directories_first_visit) + 8, | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1 + 1, | "origin_visit": 1 + 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": len(_expected_new_revisions_first_visit) + 1, | "revision": len(_expected_new_revisions_first_visit) + 1, | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 1 + 1, | "snapshot": 1 + 1, | ||||
} == stats2 | } == stats2 | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
urls = [ | urls = [ | ||||
m.url | m.url | ||||
for m in requests_mock_datadir.request_history | for m in requests_mock_datadir.request_history | ||||
if m.url.startswith("https://ftp.gnu.org") | if m.url.startswith("https://ftp.gnu.org") | ||||
] | ] | ||||
# 1 artifact (2nd time no modification) + 1 new artifact | # 1 artifact (2nd time no modification) + 1 new artifact | ||||
assert len(urls) == 2 | assert len(urls) == 2 | ||||
def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir): | def test_archive_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): | ||||
"""Load a project archive (not gnu) ends up with 1 snapshot | """Load a project archive (not gnu) ends up with 1 snapshot | ||||
""" | """ | ||||
url = "https://something.else.org/8sync/" | url = "https://something.else.org/8sync/" | ||||
artifacts = [ # this is not a gnu artifact | artifacts = [ # this is not a gnu artifact | ||||
{ | { | ||||
"time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp | "time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp | ||||
"sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa | "sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa | ||||
# keep a gnu artifact reference to avoid adding other test files | # keep a gnu artifact reference to avoid adding other test files | ||||
"url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", | "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", | ||||
"length": 238466, | "length": 238466, | ||||
"filename": "8sync-0.2.0.tar.gz", | "filename": "8sync-0.2.0.tar.gz", | ||||
"version": "0.2.0", | "version": "0.2.0", | ||||
} | } | ||||
] | ] | ||||
# Here the loader defines the id_keys to use for existence in the snapshot | # Here the loader defines the id_keys to use for existence in the snapshot | ||||
# It's not the default archive loader which | # It's not the default archive loader which | ||||
loader = ArchiveLoader( | loader = ArchiveLoader( | ||||
url, artifacts=artifacts, identity_artifact_keys=["sha256", "length", "url"] | swh_storage, | ||||
url, | |||||
artifacts=artifacts, | |||||
identity_artifact_keys=["sha256", "length", "url"], | |||||
) | ) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status["status"] == "eventful" | assert actual_load_status["status"] == "eventful" | ||||
assert actual_load_status["snapshot_id"] is not None | assert actual_load_status["snapshot_id"] is not None | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
actual_load_status2 = loader.load() | actual_load_status2 = loader.load() | ||||
assert actual_load_status2["status"] == "uneventful" | assert actual_load_status2["status"] == "uneventful" | ||||
assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] | assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] | ||||
assert_last_visit_matches(loader.storage, url, status="full", type="tar") | assert_last_visit_matches(swh_storage, url, status="full", type="tar") | ||||
urls = [ | urls = [ | ||||
m.url | m.url | ||||
for m in requests_mock_datadir.request_history | for m in requests_mock_datadir.request_history | ||||
if m.url.startswith("https://ftp.gnu.org") | if m.url.startswith("https://ftp.gnu.org") | ||||
] | ] | ||||
assert len(urls) == 1 | assert len(urls) == 1 | ||||
def test_artifact_identity(): | def test_archive_artifact_identity(): | ||||
"""Compute primary key should return the right identity | """Compute primary key should return the right identity | ||||
""" | """ | ||||
@attr.s | @attr.s | ||||
class TestPackageInfo(ArchivePackageInfo): | class TestPackageInfo(ArchivePackageInfo): | ||||
a = attr.ib() | a = attr.ib() | ||||
b = attr.ib() | b = attr.ib() | ||||
Show All 23 Lines |