# swh/loader/package/jar/tests/test_jar.py
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib | |||||
# from io import BytesIO | |||||
from pathlib import Path | |||||
# import hashlib | |||||
import string | |||||
import pytest | |||||
from swh.loader.package.jar.loader import JarLoader, JarPackageInfo | |||||
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats | |||||
from swh.model.hashutil import hash_to_bytes | |||||
from swh.model.model import Snapshot, SnapshotBranch, TargetType | |||||
# from requests.exceptions import ContentDecodingError | |||||
# import string | |||||
# Base URL of the Maven repository. In the visible part of this file it is only
# referenced by the commented-out tests at the end.
URL = "https://repo1.maven.org/maven2/"

# Artifact descriptions passed to JarLoader by the tests below. Both entries
# describe the same gid/aid ("al.aldi" / "sprova4j") at two versions.
# NOTE(review): "time" looks like an epoch timestamp in milliseconds -- confirm
# against the lister that produces these entries.
MVN_ARTIFACTS = [
    {
        "time": 1626109619335,
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/"
        + "sprova4j-0.1.0.jar",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "filename": "sprova4j-0.1.0.jar",
        "version": "0.1.0",
    },
    {
        "time": 1626111425534,
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.1/"
        + "sprova4j-0.1.1.jar",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "filename": "sprova4j-0.1.1.jar",
        "version": "0.1.1",
    },
]
_expected_new_contents_first_visit = [ | |||||
"cd807364cd7730022b3849f90ccf4bababbada84", | |||||
"79e33dd52ebdf615e6696ae69add91cb990d81e2", | |||||
"8002bd514156f05a0940ae14ef86eb0179cbd510", | |||||
"23479553a6ccec30d377dee0496123a65d23fd8c", | |||||
"07ffbebb933bc1660e448f07d8196c2b083797f9", | |||||
"abf021b581f80035b56153c9aa27195b8d7ebbb8", | |||||
"eec70ba80a6862ed2619727663b17eb0d9dfe131", | |||||
"81a493dacb44dedf623f29ecf62c0e035bf698de", | |||||
"bda85ed0bbecf8cddfea04234bee16f476f64fe4", | |||||
"1ec91d561f5bdf59acb417086e04c54ead94e94e", | |||||
"d517b423da707fa21378623f35facebff53cb59d", | |||||
"3f0f21a764972d79e583908991c893c999613354", | |||||
"a2dd4d7dfe6043baf9619081e4e29966989211af", | |||||
"f62685cf0c6825a4097c949280b584cf0e16d047", | |||||
"56afc1ea60cef6548ce0a34f44e91b0e4b063835", | |||||
"cf7c740926e7ebc9ac8978a5c4f0e1e7a0e9e3af", | |||||
"86ff828bea1c22ca3d50ed82569b9c59ce2c41a1", | |||||
"1d0fa04454d9fec31d8ee3f35b58158ca1e28b15", | |||||
"e90239a2c8d9ede61a29671a8b397a743e18fa34", | |||||
"ce8851005d084aea089bcd8cf01052f4b234a823", | |||||
"2c34ce622aa7fa68d104900840f66671718e6249", | |||||
"e6a6fec32dcb3bee93c34fc11b0174a6b0b0ec6d", | |||||
"405d3e1be4b658bf26de37f2c90c597b2796b9d7", | |||||
"d0d2f5848721e04300e537826ef7d2d6d9441df0", | |||||
"399c67e33e38c475fd724d283dd340f6a2e8dc91", | |||||
"dea10c1111cc61ac1809fb7e88857e3db054959f", | |||||
] | |||||
_expected_new_directories_first_visit = [ | |||||
"6c9de41e4cebb91a8368da1d89ae9873bd540ec3", | |||||
"c1a2ee97fc47426d0179f94d223405336b5cd075", | |||||
"9e1bdca292765a9528af18743bd793b80362c768", | |||||
"193a7af634592ef27fb341762806f61e8fb8eab3", | |||||
"a297aa21e3dbf138b370be3aae7a852dd403bbbb", | |||||
"da84026119ae04022f007d5b3362e98d46d09045", | |||||
"75bb915942a9c441ca62aeffc3b634f1ec9ce5e2", | |||||
"0851d359283b2ad82b116c8d1b55ab14b1ec219c", | |||||
"2bcbb8b723a025ee9a36b719cea229ed38c37e46", | |||||
] | |||||
_expected_new_revisions_first_visit = { | |||||
"b7cd10479d3639a813b444d2ccfbdaa0b2613690": ( | |||||
"6c9de41e4cebb91a8368da1d89ae9873bd540ec3" | |||||
) | |||||
} | |||||
@pytest.fixture
def data_jar_1(datadir):
    """Return the bytes of the sprova4j 0.1.0 sources jar found under ``datadir``.

    The tests serve these bytes as the mocked HTTP response for
    ``MVN_ARTIFACTS[0]["url"]``.
    """
    jar_path = Path(datadir, "https_maven.org", "sprova4j-0.1.0-sources.jar")
    return jar_path.read_bytes()
@pytest.fixture
def data_jar_2(datadir):
    """Return the bytes of the sprova4j 0.1.1 sources jar found under ``datadir``.

    The tests serve these bytes as the mocked HTTP response for
    ``MVN_ARTIFACTS[1]["url"]``.
    """
    jar_path = Path(datadir, "https_maven.org", "sprova4j-0.1.1-sources.jar")
    return jar_path.read_bytes()
def test_jar_visit_with_no_artifact_found(swh_storage, requests_mock_datadir):
    """Loading an artifact from an unknown URL stores nothing but the visit.

    The load is expected to be "uneventful" while still reporting a snapshot
    id, storage must only hold the origin, its visit and one snapshot, and the
    visit must be recorded as "partial".
    """
    unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz"
    loader = JarLoader(
        swh_storage,
        unknown_artifact_url,
        artifacts=[
            {
                "time": 1626111944729,
                "url": unknown_artifact_url,  # unknown artifact
                "filename": "8sync-0.1.0.tar.gz",
                "gid": "al/aldi",
                "aid": "sprova4j",
                "version": "0.1.0",
            }
        ],
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "uneventful"
    assert actual_load_status["snapshot_id"] is not None

    # No content, directory, revision or release may have been written.
    stats = get_stats(swh_storage)
    assert {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert_last_visit_matches(
        swh_storage, unknown_artifact_url, status="partial", type="jar"
    )
def test_jar_visit_with_release_artifact_no_prior_visit(
    swh_storage, requests_mock, data_jar_1
):
    """With no prior visit, loading a jar ends up with 1 snapshot.

    The snapshot must hold a ``HEAD`` alias pointing at ``releases/0.1.0``,
    which itself targets the expected revision, and every expected content,
    directory and revision must be present in storage afterwards.
    """
    # Serve the local jar bytes in place of the remote artifact.
    requests_mock.get(MVN_ARTIFACTS[0]["url"], content=data_jar_1)
    loader = JarLoader(
        swh_storage, MVN_ARTIFACTS[0]["url"], artifacts=[MVN_ARTIFACTS[0]]
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    expected_snapshot_first_visit_id = hash_to_bytes(
        "677b0280cbe9e650c9dfa80fbb1ab7ebb1603c86"
    )
    expected_snapshot = Snapshot(
        id=expected_snapshot_first_visit_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.1.0",
            ),
            b"releases/0.1.0": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=hash_to_bytes("b7cd10479d3639a813b444d2ccfbdaa0b2613690"),
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert (
        hash_to_bytes(actual_load_status["snapshot_id"])
        == expected_snapshot_first_visit_id
    )

    stats = get_stats(swh_storage)
    assert_last_visit_matches(
        swh_storage, MVN_ARTIFACTS[0]["url"], status="full", type="jar"
    )

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": len(_expected_new_revisions_first_visit),
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # None of the expected objects may be reported missing by the storage.
    expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []

    expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
    assert list(swh_storage.directory_missing(expected_dirs)) == []

    # Iterating the dict yields its keys, i.e. the revision identifiers.
    expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
    assert list(swh_storage.revision_missing(expected_revs)) == []
def test_jar_2_visits_without_change(
    swh_storage, requests_mock_datadir, requests_mock, data_jar_2
):
    """Loading the same jar twice is eventful first, then uneventful.

    Both visits must report the same snapshot id and be recorded as "full",
    and the artifact must have been requested only once over the two loads.
    """
    # Serve the local jar bytes in place of the remote artifact.
    requests_mock.get(MVN_ARTIFACTS[1]["url"], content=data_jar_2)
    loader = JarLoader(
        swh_storage, MVN_ARTIFACTS[1]["url"], artifacts=[MVN_ARTIFACTS[1]]
    )

    # First visit: everything is new.
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    expected_snapshot_first_visit_id = hash_to_bytes(
        "5da7a3cc23230c9d1be27395a2ab483e8d01ae19"
    )
    assert (
        hash_to_bytes(actual_load_status["snapshot_id"])
        == expected_snapshot_first_visit_id
    )
    assert_last_visit_matches(
        swh_storage, MVN_ARTIFACTS[1]["url"], status="full", type="jar"
    )

    # Second visit with the same loader: nothing changes.
    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "uneventful"
    assert actual_load_status2["snapshot_id"] is not None
    assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"]
    assert_last_visit_matches(
        swh_storage, MVN_ARTIFACTS[1]["url"], status="full", type="jar"
    )

    # Only one request to the Maven repository may have been recorded.
    urls = [
        m.url
        for m in requests_mock_datadir.request_history
        if m.url.startswith("https://repo1.maven.org/maven2")
    ]
    assert len(urls) == 1
def test_jar_extid():
    """Check the extid computed by ``JarPackageInfo.extid``.

    With the default manifest format the extid must be the ``"maven-jar"`` type
    paired with the SHA-256 digest of ``"<gid> <aid> <version>"``. Other
    manifest formats must give a different extid, and a format referencing an
    unknown key must raise ``KeyError``.
    """
    metadata = MVN_ARTIFACTS[0]
    p_info = JarPackageInfo(raw_info={**metadata}, **metadata)

    # Manifest formats other than the default must not yield the default extid.
    expected_manifest = b"al.aldi sprova4j 0.1.0"
    for manifest_format in [
        string.Template("$aid $gid $version"),
        string.Template("$gid $aid"),
    ]:
        actual_id = p_info.extid(manifest_format=manifest_format)
        assert actual_id != ("maven-jar", hashlib.sha256(expected_manifest).digest(),)

    # Default manifest format (manifest_format=None).
    expected_manifest = "{gid} {aid} {version}".format(**metadata).encode()
    actual_id = p_info.extid(manifest_format=None)
    assert actual_id == ("maven-jar", hashlib.sha256(expected_manifest).digest(),)

    # A template referencing keys absent from the package info must fail.
    with pytest.raises(KeyError):
        p_info.extid(manifest_format=string.Template("$a $unknown_key"))
def test_jar_snapshot_append(
    swh_storage, requests_mock_datadir, requests_mock, data_jar_1, data_jar_2
):
    """Load two artifacts in a row with ``snapshot_append=True``.

    Each load uses the artifact's own URL as the loader URL, and each resulting
    snapshot is expected to hold exactly two branches: ``HEAD`` and the release
    branch of the artifact that was just loaded.
    """
    # first loading with a first artifact
    artifact1 = MVN_ARTIFACTS[0]
    url1 = artifact1["url"]
    requests_mock.get(url1, content=data_jar_1)
    loader = JarLoader(swh_storage, url1, [artifact1], snapshot_append=True)
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url1, status="full", type="jar")

    # check expected snapshot
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    branch_artifact1_name = f"releases/{artifact1['version']}".encode()
    assert b"HEAD" in snapshot.branches
    assert branch_artifact1_name in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == branch_artifact1_name

    # second loading with a second artifact
    artifact2 = MVN_ARTIFACTS[1]
    url2 = artifact2["url"]
    requests_mock.get(url2, content=data_jar_2)
    loader = JarLoader(swh_storage, url2, [artifact2], snapshot_append=True)
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url2, status="full", type="jar")

    # check expected snapshot: it only contains HEAD and the branch of the
    # second artifact; the branch of the first artifact is NOT carried over.
    # NOTE(review): presumably because the second load is made with a different
    # loader URL (url2), so there is no earlier snapshot to append to -- confirm
    # whether both loads were meant to share one URL to really exercise
    # snapshot_append.
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    branch_artifact2_name = f"releases/{artifact2['version']}".encode()
    assert b"HEAD" in snapshot.branches
    assert branch_artifact2_name in snapshot.branches
    assert branch_artifact1_name not in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == branch_artifact2_name
# def test_jar_snapshot_append_branch_override(swh_storage, requests_mock_datadir): | |||||
# # first loading for a first artifact | |||||
# artifact1 = GNU_ARTIFACTS[0] | |||||
# loader = JarLoader(swh_storage, URL, [artifact1], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# branch_artifact1_name = f"releases/{artifact1['version']}".encode() | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# branch_target_first_visit = snapshot.branches[branch_artifact1_name].target | |||||
# # second loading for a second artifact with same version as the first one | |||||
# # but with different tarball content | |||||
# artifact2 = dict(GNU_ARTIFACTS[0]) | |||||
# artifact2["url"] = GNU_ARTIFACTS[1]["url"] | |||||
# artifact2["time"] = GNU_ARTIFACTS[1]["time"] | |||||
# artifact2["length"] = GNU_ARTIFACTS[1]["length"] | |||||
# loader = JarLoader(swh_storage, URL, [artifact2], snapshot_append=True) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# assert_last_visit_matches(swh_storage, URL, status="full", type="tar") | |||||
# # check expected snapshot, should contain the same branch as previously | |||||
# # but with different target | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# assert branch_artifact1_name in snapshot.branches | |||||
# branch_target_second_visit = snapshot.branches[branch_artifact1_name].target | |||||
# assert branch_target_first_visit != branch_target_second_visit | |||||
# @pytest.fixture | |||||
# def not_gzipped_tarball_bytes(datadir): | |||||
# return Path(datadir, "not_gzipped_tarball.tar.gz").read_bytes() | |||||
# def test_jar_not_gzipped_tarball( | |||||
# swh_storage, requests_mock, not_gzipped_tarball_bytes | |||||
# ): | |||||
# """Check that a tarball erroneously marked as gzip compressed can still | |||||
# be downloaded and processed. | |||||
# """ | |||||
# filename = "not_gzipped_tarball.tar.gz" | |||||
# url = f"https://example.org/ftp/{filename}" | |||||
# requests_mock.get( | |||||
# url, | |||||
# [ | |||||
# {"exc": ContentDecodingError,}, | |||||
# {"body": BytesIO(not_gzipped_tarball_bytes),}, | |||||
# ], | |||||
# ) | |||||
# loader = JarLoader( | |||||
# swh_storage, | |||||
# url, | |||||
# artifacts=[ | |||||
# { | |||||
# "time": 944729610, | |||||
# "url": url, | |||||
# "length": 221837, | |||||
# "filename": filename, | |||||
# "version": "0.1.0", | |||||
# } | |||||
# ], | |||||
# ) | |||||
# actual_load_status = loader.load() | |||||
# assert actual_load_status["status"] == "eventful" | |||||
# assert actual_load_status["snapshot_id"] is not None | |||||
# snapshot = loader.last_snapshot() | |||||
# assert len(snapshot.branches) == 2 | |||||
# assert b"releases/0.1.0" in snapshot.branches |