Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/jar/tests/test_jar.py
- This file was added.
# Copyright (C) 2019-2021 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
# import hashlib | |||||
# import string | |||||
# from io import BytesIO | |||||
from pathlib import Path | |||||
# import attr | |||||
import pytest | |||||
from swh.loader.package.jar.loader import JarLoader | |||||
# , JarPackageInfo | |||||
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats | |||||
from swh.model.hashutil import hash_to_bytes | |||||
from swh.model.model import Snapshot, SnapshotBranch, TargetType | |||||
# from requests.exceptions import ContentDecodingError | |||||
# import string | |||||
# Base repository URL; several tests below also use it as the origin URL.
URL = "https://repo1.maven.org/maven2/"

# Descriptions of two sprova4j jar releases, in the shape handed to
# ``JarLoader`` through its ``artifacts`` argument.
MVN_ARTIFACTS = [
    {
        "time": 1626109619335,
        "url": URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.jar",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "filename": "sprova4j-0.1.0.jar",
        "version": "0.1.0",
    },
    {
        "time": 1626111425534,
        "url": URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.jar",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "filename": "sprova4j-0.1.1.jar",
        "version": "0.1.1",
    },
]
# Hex ids of the objects the storage is expected to hold once the first
# artifact (sprova4j 0.1.0) has been loaded.
_expected_new_contents_first_visit = [
    "e9258d81faf5881a2f96a77ba609396f82cb97ad",
    "1572607d456d7f633bc6065a2b3048496d679a31",
]

_expected_new_directories_first_visit = [
    "daabc65ec75d487b1335ffc101c0ac11c803f8fc",
    "3aebc29ed1fccc4a6f2f2010fb8e57882406b528",
]

# Keyed by revision id. The value is one of the directory ids listed above
# (presumably the revision's root directory -- TODO confirm).
_expected_new_revisions_first_visit = {
    "44183488c0774ce3c957fa19ba695cf18a4a42b3": (
        "3aebc29ed1fccc4a6f2f2010fb8e57882406b528"
    ),
}
@pytest.fixture
def data_jar_1(datadir):
    """Return the raw bytes of the ``sprova4j-0.1.0-sources.jar`` test resource.

    The file is read from the ``https_maven.org`` sub-directory of ``datadir``.
    """
    jar_path = Path(datadir) / "https_maven.org" / "sprova4j-0.1.0-sources.jar"
    return jar_path.read_bytes()
def test_jar_visit_with_no_artifact_found(swh_storage, requests_mock_datadir):
    """Visit an origin whose single artifact points to an unknown URL.

    The load must report "uneventful" with a snapshot id, the visit must be
    recorded as "partial", and the storage must hold one origin, one visit and
    one snapshot but no content, directory, revision or release.
    """
    origin_url = URL
    missing_artifact = {
        "time": 1626111944729,
        "url": "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz",  # unknown artifact
        "filename": "8sync-0.1.0.tar.gz",
        "gid": "al/aldi",
        "aid": "sprova4j",
        "version": "0.1.0",
    }

    load_status = JarLoader(
        swh_storage, origin_url, artifacts=[missing_artifact]
    ).load()

    assert load_status["status"] == "uneventful"
    assert load_status["snapshot_id"] is not None

    storage_stats = get_stats(swh_storage)
    expected_stats = {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert expected_stats == storage_stats

    assert_last_visit_matches(swh_storage, origin_url, status="partial", type="jar")
def test_jar_visit_with_release_artifact_no_prior_visit(
    swh_storage, requests_mock, data_jar_1
):
    """With no prior visit, loading a jar ends up with 1 snapshot.

    The artifact download is mocked to return ``data_jar_1``. The test then
    checks the load status, the snapshot id, the storage statistics, the
    presence of the expected contents/directories/revisions and the snapshot
    branches.
    """
    artifact = MVN_ARTIFACTS[0]
    # The loader is given the artifact URL as its origin URL, so the visit has
    # to be looked up under that same URL (not under the repository base URL).
    origin_url = artifact["url"]
    requests_mock.get(artifact["url"], content=data_jar_1)

    loader = JarLoader(swh_storage, origin_url, artifacts=[artifact])

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    expected_snapshot_first_visit_id = hash_to_bytes(
        "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
    )
    assert (
        hash_to_bytes(actual_load_status["snapshot_id"])
        == expected_snapshot_first_visit_id
    )

    assert_last_visit_matches(swh_storage, origin_url, status="full", type="jar")

    stats = get_stats(swh_storage)
    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": len(_expected_new_revisions_first_visit),
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # None of the expected objects may be reported as missing by the storage.
    expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []

    expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
    assert list(swh_storage.directory_missing(expected_dirs)) == []

    # Iterating the dict yields its keys, i.e. the revision ids.
    expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
    assert list(swh_storage.revision_missing(expected_revs)) == []

    expected_snapshot = Snapshot(
        id=expected_snapshot_first_visit_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.1.0",
            ),
            b"releases/0.1.0": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3"),
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)
def test_jar_2_visits_without_change(swh_storage, requests_mock_datadir):
    """Visiting the same jar artifact twice yields the same snapshot.

    The first visit is expected to be eventful and the second one uneventful,
    both with a "full" visit status, and the artifact must have been requested
    only once over the two visits.
    """
    url = URL
    artifact = MVN_ARTIFACTS[1]
    # ``artifacts`` takes a list of artifact dicts, as in the other tests.
    loader = JarLoader(swh_storage, url, artifacts=[artifact])

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url, status="full", type="jar")

    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "uneventful"
    assert actual_load_status2["snapshot_id"] is not None
    assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"]
    assert_last_visit_matches(swh_storage, url, status="full", type="jar")

    # Count the requests made for the artifact itself: the second visit must
    # not have fetched it again.
    urls = [
        m.url
        for m in requests_mock_datadir.request_history
        if m.url.startswith(artifact["url"])
    ]
    assert len(urls) == 1
# def test_jar_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): | |||||
# """With no prior visit, load a gnu project ends up with 1 snapshot | |||||
# """ | |||||
# url = URL | |||||
# artifact1 = GNU_ARTIFACTS[0] | |||||
# loader = JarLoader(swh_storage, url, [artifact1]) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, url, status="full", type="tar") | |||||
# stats = get_stats(swh_storage) | |||||
# assert { | |||||
# "content": len(_expected_new_contents_first_visit), | |||||
# "directory": len(_expected_new_directories_first_visit), | |||||
# "origin": 1, | |||||
# "origin_visit": 1, | |||||
# "release": 0, | |||||
# "revision": len(_expected_new_revisions_first_visit), | |||||
# "skipped_content": 0, | |||||
# "snapshot": 1, | |||||
# } == stats | |||||
# urls = [ | |||||
# m.url | |||||
# for m in requests_mock_datadir.request_history | |||||
# if m.url.startswith("https://ftp.gnu.org") | |||||
# ] | |||||
# assert len(urls) == 1 | |||||
# artifact2 = GNU_ARTIFACTS[1] | |||||
# loader2 = JarLoader(swh_storage, url, [artifact1, artifact2]) | |||||
# stats2 = get_stats(swh_storage) | |||||
# assert stats == stats2 # ensure we share the storage | |||||
# actual_load_status2 = loader2.load() | |||||
# assert actual_load_status2["status"] == "eventful" | |||||
# assert actual_load_status2["snapshot_id"] is not None | |||||
# stats2 = get_stats(swh_storage) | |||||
# assert { | |||||
# "content": len(_expected_new_contents_first_visit) + 14, | |||||
# "directory": len(_expected_new_directories_first_visit) + 8, | |||||
# "origin": 1, | |||||
# "origin_visit": 1 + 1, | |||||
# "release": 0, | |||||
# "revision": len(_expected_new_revisions_first_visit) + 1, | |||||
# "skipped_content": 0, | |||||
# "snapshot": 1 + 1, | |||||
# } == stats2 | |||||
# assert_last_visit_matches(swh_storage, url, status="full", type="tar") | |||||
# urls = [ | |||||
# m.url | |||||
# for m in requests_mock_datadir.request_history | |||||
# if m.url.startswith("https://ftp.gnu.org") | |||||
# ] | |||||
# # 1 artifact (2nd time no modification) + 1 new artifact | |||||
# assert len(urls) == 2 | |||||
# def test_jar_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): | |||||
# """Load a project jar (not gnu) ends up with 1 snapshot | |||||
# """ | |||||
# url = "https://something.else.org/8sync/" | |||||
# artifacts = [ # this is not a gnu artifact | |||||
# { | |||||
# "time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp | |||||
# "sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa | |||||
# # keep a gnu artifact reference to avoid adding other test files | |||||
# "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", | |||||
# "length": 238466, | |||||
# "filename": "8sync-0.2.0.tar.gz", | |||||
# "version": "0.2.0", | |||||
# } | |||||
# ] | |||||
# # Here the loader defines the id_keys to use for existence in the snapshot | |||||
# # It's not the default jar loader which | |||||
# loader = JarLoader( | |||||
# swh_storage, | |||||
# url, | |||||
# artifacts=artifacts, | |||||
# extid_manifest_format="$sha256 $length $url", | |||||
# ) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, url, status="full", type="tar") | |||||
# actual_load_status2 = loader.load() | |||||
# assert actual_load_status2["status"] == "uneventful" | |||||
# assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] | |||||
# assert_last_visit_matches(swh_storage, url, status="full", type="tar") | |||||
# urls = [ | |||||
# m.url | |||||
# for m in requests_mock_datadir.request_history | |||||
# if m.url.startswith("https://ftp.gnu.org") | |||||
# ] | |||||
# assert len(urls) == 1 | |||||
# def test_jar_extid(): | |||||
# """Compute primary key should return the right identity | |||||
# """ | |||||
# @attr.s | |||||
# class TestPackageInfo(JarPackageInfo): | |||||
# a = attr.ib() | |||||
# b = attr.ib() | |||||
# metadata = MVN_ARTIFACTS[0] | |||||
# p_info = TestPackageInfo( | |||||
# raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata, | |||||
# ) | |||||
# for manifest_format, expected_manifest in [ | |||||
# (string.Template("$a $b"), b"1 2"), | |||||
# (string.Template(""), b""), | |||||
# (None, "{gid} {aid} {version}".format(**metadata).encode()), | |||||
# ]: | |||||
# actual_id = p_info.extid(manifest_format=manifest_format) | |||||
# assert actual_id == ( | |||||
# "package-manifest-sha256", | |||||
# hashlib.sha256(expected_manifest).digest(), | |||||
# ) | |||||
# with pytest.raises(KeyError): | |||||
# p_info.extid(manifest_format=string.Template("$a $unknown_key")) | |||||
# def test_jar_snapshot_append(swh_storage, requests_mock_datadir): | |||||
# # first loading with a first artifact | |||||
# artifact1 = GNU_ARTIFACTS[0] | |||||
# loader = JarLoader(swh_storage, URL, [artifact1], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# branch_artifact1_name = f"releases/{artifact1['version']}".encode() | |||||
# assert b"HEAD" in snapshot.branches | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# assert snapshot.branches[b"HEAD"].target == branch_artifact1_name | |||||
# # second loading with a second artifact | |||||
# artifact2 = GNU_ARTIFACTS[1] | |||||
# loader = JarLoader(swh_storage, URL, [artifact2], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot, should contain a new branch and the | |||||
# # branch for the first artifact | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 3 | |||||
# branch_artifact2_name = f"releases/{artifact2['version']}".encode() | |||||
# assert b"HEAD" in snapshot.branches | |||||
# assert branch_artifact2_name in snapshot.branches | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# assert snapshot.branches[b"HEAD"].target == branch_artifact2_name | |||||
# def test_jar_snapshot_append_branch_override(swh_storage, requests_mock_datadir): | |||||
# # first loading for a first artifact | |||||
# artifact1 = GNU_ARTIFACTS[0] | |||||
# loader = JarLoader(swh_storage, URL, [artifact1], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# branch_artifact1_name = f"releases/{artifact1['version']}".encode() | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# branch_target_first_visit = snapshot.branches[branch_artifact1_name].target | |||||
# # second loading for a second artifact with same version as the first one | |||||
# # but with different tarball content | |||||
# artifact2 = dict(GNU_ARTIFACTS[0]) | |||||
# artifact2["url"] = GNU_ARTIFACTS[1]["url"] | |||||
# artifact2["time"] = GNU_ARTIFACTS[1]["time"] | |||||
# artifact2["length"] = GNU_ARTIFACTS[1]["length"] | |||||
# loader = JarLoader(swh_storage, URL, [artifact2], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot, should contain the same branch as previously | |||||
# # but with different target | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# branch_target_second_visit = snapshot.branches[branch_artifact1_name].target | |||||
# assert branch_target_first_visit != branch_target_second_visit | |||||
# @pytest.fixture | |||||
# def not_gzipped_tarball_bytes(datadir): | |||||
# return Path(datadir, "not_gzipped_tarball.tar.gz").read_bytes() | |||||
# def test_jar_not_gzipped_tarball( | |||||
# swh_storage, requests_mock, not_gzipped_tarball_bytes | |||||
# ): | |||||
# """Check that a tarball erroneously marked as gzip compressed can still | |||||
# be downloaded and processed. | |||||
# """ | |||||
# filename = "not_gzipped_tarball.tar.gz" | |||||
# url = f"https://example.org/ftp/{filename}" | |||||
# requests_mock.get( | |||||
# url, | |||||
# [ | |||||
# {"exc": ContentDecodingError,}, | |||||
# {"body": BytesIO(not_gzipped_tarball_bytes),}, | |||||
# ], | |||||
# ) | |||||
# loader = JarLoader( | |||||
# swh_storage, | |||||
# url, | |||||
# artifacts=[ | |||||
# { | |||||
# "time": 944729610, | |||||
# "url": url, | |||||
# "length": 221837, | |||||
# "filename": filename, | |||||
# "version": "0.1.0", | |||||
# } | |||||
# ], | |||||
# ) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# assert b"releases/0.1.0" in snapshot.branches |