diff --git a/swh/loader/package/maven/loader.py b/swh/loader/package/maven/loader.py index d664e67..e2d9d05 100644 --- a/swh/loader/package/maven/loader.py +++ b/swh/loader/package/maven/loader.py @@ -1,204 +1,206 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from __future__ import annotations + from datetime import datetime, timezone import json import logging from os import path import string from typing import Any, Iterator, List, Optional, Sequence, Tuple import attr import iso8601 import requests from typing_extensions import TypedDict from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import EMPTY_AUTHOR, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, RawExtrinsicMetadata, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) class ArtifactDict(TypedDict): """Data about a Maven artifact, passed by the Maven Lister.""" time: str """the time of the last update of jar file on the server as an iso8601 date string """ url: str """the artifact url to retrieve filename""" filename: Optional[str] """optionally, the file's name""" gid: str """artifact's groupId""" aid: str """artifact's artifactId""" version: str """artifact's version""" base_url: str """root URL of the Maven instance""" @attr.s class MavenPackageInfo(BasePackageInfo): time = attr.ib(type=datetime) """Timestamp of the last update of jar file on the server.""" gid = attr.ib(type=str) """Group ID of the maven artifact""" aid = attr.ib(type=str) """Artifact ID of the maven artifact""" version = attr.ib(type=str) """Version of the maven artifact""" base_url = attr.ib(type=str) """Root URL of the Maven instance""" # default format for maven artifacts MANIFEST_FORMAT = string.Template("$gid $aid $version $url $time") EXTID_TYPE = "maven-jar" EXTID_VERSION = 0 @classmethod - def from_metadata(cls, url: str, a_metadata: ArtifactDict) -> "MavenPackageInfo": + def from_metadata(cls, a_metadata: ArtifactDict) -> MavenPackageInfo: time = iso8601.parse_date(a_metadata["time"]).astimezone(tz=timezone.utc) + url = a_metadata["url"] return cls( url=url, filename=a_metadata.get("filename") or path.split(url)[-1], time=time, gid=a_metadata["gid"], aid=a_metadata["aid"], version=a_metadata["version"], base_url=a_metadata["base_url"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="maven-json", metadata=json.dumps(a_metadata).encode(), ), ], ) class MavenLoader(PackageLoader[MavenPackageInfo]): """Load source code jar origin's artifact files into swh archive""" visit_type = "maven" def __init__( self, storage: StorageInterface, url: str, artifacts: Sequence[ArtifactDict], **kwargs: Any, ): """Loader constructor. For now, this is the lister's task output. There is one, and only one, artefact (jar or zip) per version, as guaranteed by the Maven coordinates system. Args: url: Origin url artifacts: List of single artifact information """ super().__init__(storage=storage, url=url, **kwargs) self.artifacts = artifacts # assume order is enforced in the lister self.version_artifact = { jar["version"]: jar for jar in artifacts if jar["version"] } if artifacts: base_urls = {jar["base_url"] for jar in artifacts} try: (self.base_url,) = base_urls except ValueError: raise ValueError( "Artifacts originate from more than one Maven instance: " + ", ".join(base_urls) ) from None else: # There is no artifact, so self.metadata_authority won't be called, # so self.base_url won't be accessed. pass def get_versions(self) -> Sequence[str]: return list(self.version_artifact) def get_default_version(self) -> str: # Default version is the last item return self.artifacts[-1]["version"] def get_metadata_authority(self): return MetadataAuthority(type=MetadataAuthorityType.FORGE, url=self.base_url) def build_extrinsic_directory_metadata( self, p_info: MavenPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: # Rebuild POM URL. pom_url = path.dirname(p_info.url) pom_url = f"{pom_url}/{p_info.aid}-{p_info.version}.pom" r = requests.get(pom_url, allow_redirects=True) if r.status_code == 200: metadata_pom = r.content else: metadata_pom = b"" p_info.directory_extrinsic_metadata.append( RawExtrinsicMetadataCore( format="maven-pom", metadata=metadata_pom, ) ) return super().build_extrinsic_directory_metadata( p_info=p_info, release_id=release_id, directory_id=directory_id, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, MavenPackageInfo]]: a_metadata = self.version_artifact[version] - yield release_name(a_metadata["version"]), MavenPackageInfo.from_metadata( - self.origin.url, a_metadata - ) + rel_name = release_name(a_metadata["version"]) + yield rel_name, MavenPackageInfo.from_metadata(a_metadata) def build_release( self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8") normalized_time = TimestampWithTimezone.from_datetime(p_info.time) return Release( name=p_info.version.encode(), message=msg, date=normalized_time, author=EMPTY_AUTHOR, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) diff --git a/swh/loader/package/maven/tests/test_maven.py b/swh/loader/package/maven/tests/test_maven.py index 96b6ad6..36de2a7 100644 --- a/swh/loader/package/maven/tests/test_maven.py +++ b/swh/loader/package/maven/tests/test_maven.py @@ -1,618 +1,475 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib +from itertools import chain import json +import os from pathlib import Path import pytest +from swh.core.tarball import uncompress from swh.loader.package import __version__ from swh.loader.package.maven.loader import MavenLoader, MavenPackageInfo from swh.loader.package.utils import EMPTY_AUTHOR from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.from_disk import Directory, iter_directory from swh.model.hashutil import hash_to_bytes from swh.model.model import ( RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.algos.snapshot import snapshot_get_all_branches REPO_BASE_URL = "https://repo1.maven.org/maven2/" -MVN_ARTIFACT_URLS = [ - f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", - f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", -] +MVN_ORIGIN_URL = f"{REPO_BASE_URL}al/aldi/sprova4j" MVN_ARTIFACTS = [ { "time": "2021-07-12 19:06:59.335000", "gid": "al.aldi", "aid": "sprova4j", "filename": "sprova4j-0.1.0-sources.jar", "version": "0.1.0", "base_url": REPO_BASE_URL, + "url": f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", }, { "time": "2021-07-12 19:37:05.534000", "gid": "al.aldi", "aid": "sprova4j", "filename": "sprova4j-0.1.1-sources.jar", "version": "0.1.1", "base_url": REPO_BASE_URL, + "url": f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", }, ] MVN_ARTIFACTS_POM = [ f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom", f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom", ] -_expected_new_contents_first_visit = [ - "cd807364cd7730022b3849f90ccf4bababbada84", - "79e33dd52ebdf615e6696ae69add91cb990d81e2", - "8002bd514156f05a0940ae14ef86eb0179cbd510", - "23479553a6ccec30d377dee0496123a65d23fd8c", - "07ffbebb933bc1660e448f07d8196c2b083797f9", - "abf021b581f80035b56153c9aa27195b8d7ebbb8", - "eec70ba80a6862ed2619727663b17eb0d9dfe131", - "81a493dacb44dedf623f29ecf62c0e035bf698de", - "bda85ed0bbecf8cddfea04234bee16f476f64fe4", - "1ec91d561f5bdf59acb417086e04c54ead94e94e", - "d517b423da707fa21378623f35facebff53cb59d", - "3f0f21a764972d79e583908991c893c999613354", - "a2dd4d7dfe6043baf9619081e4e29966989211af", - "f62685cf0c6825a4097c949280b584cf0e16d047", - "56afc1ea60cef6548ce0a34f44e91b0e4b063835", - "cf7c740926e7ebc9ac8978a5c4f0e1e7a0e9e3af", - "86ff828bea1c22ca3d50ed82569b9c59ce2c41a1", - "1d0fa04454d9fec31d8ee3f35b58158ca1e28b15", - "e90239a2c8d9ede61a29671a8b397a743e18fa34", - "ce8851005d084aea089bcd8cf01052f4b234a823", - "2c34ce622aa7fa68d104900840f66671718e6249", - "e6a6fec32dcb3bee93c34fc11b0174a6b0b0ec6d", - "405d3e1be4b658bf26de37f2c90c597b2796b9d7", - "d0d2f5848721e04300e537826ef7d2d6d9441df0", - "399c67e33e38c475fd724d283dd340f6a2e8dc91", - "dea10c1111cc61ac1809fb7e88857e3db054959f", -] - -_expected_json_metadata = { - "time": "2021-07-12 19:06:59.335000", - "gid": "al.aldi", - "aid": "sprova4j", - "filename": "sprova4j-0.1.0-sources.jar", - "version": "0.1.0", - "base_url": REPO_BASE_URL, -} -_expected_pom_metadata = ( - """ - - 4.0.0 - al.aldi - sprova4j - 0.1.0 - sprova4j - Java client for Sprova Test Management - https://github.com/aldialimucaj/sprova4j - 2018 - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - aldi - Aldi Alimucaj - aldi.alimucaj@gmail.com - - - - scm:git:git://github.com/aldialimucaj/sprova4j.git - scm:git:git://github.com/aldialimucaj/sprova4j.git - https://github.com/aldialimucaj/sprova4j - - - - ch.qos.logback - logback-classic - 1.2.3 - runtime - - - com.google.code.gson - gson - 2.8.3 - runtime - - - com.squareup.okhttp3 - okhttp - 3.10.0 - runtime - - - com.squareup.okio - okio - 1.0.0 - runtime - - - org.glassfish - javax.json - 1.1.2 - runtime - - - javax.json - javax.json-api - 1.1.2 - runtime - - - javax.validation - validation-api - 2.0.1.Final - runtime - - - junit - junit - 4.12 - test - - - com.squareup.okhttp3 - mockwebserver - 3.10.0 - test - - - -""" -) - -_expected_new_directories_first_visit = [ - "6c9de41e4cebb91a8368da1d89ae9873bd540ec3", - "c1a2ee97fc47426d0179f94d223405336b5cd075", - "9e1bdca292765a9528af18743bd793b80362c768", - "193a7af634592ef27fb341762806f61e8fb8eab3", - "a297aa21e3dbf138b370be3aae7a852dd403bbbb", - "da84026119ae04022f007d5b3362e98d46d09045", - "75bb915942a9c441ca62aeffc3b634f1ec9ce5e2", - "0851d359283b2ad82b116c8d1b55ab14b1ec219c", - "2bcbb8b723a025ee9a36b719cea229ed38c37e46", -] - -_expected_new_release_first_visit = "02e83c29ec094db581f939d2e238d0613a4f59ac" - -REL_MSG = ( +REL_MSGS = ( b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/" - b"sprova4j/0.1.0/sprova4j-0.1.0-sources.jar\n" + b"sprova4j/0.1.0/sprova4j-0.1.0-sources.jar\n", + b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/" + b"sprova4j/0.1.1/sprova4j-0.1.1-sources.jar\n", ) -REVISION_DATE = TimestampWithTimezone.from_datetime( - datetime.datetime(2021, 7, 12, 19, 6, 59, 335000, tzinfo=datetime.timezone.utc) +REL_DATES = ( + TimestampWithTimezone.from_datetime( + datetime.datetime(2021, 7, 12, 19, 6, 59, 335000, tzinfo=datetime.timezone.utc) + ), + TimestampWithTimezone.from_datetime( + datetime.datetime(2021, 7, 12, 19, 37, 5, 534000, tzinfo=datetime.timezone.utc) + ), ) @pytest.fixture def data_jar_1(datadir): content = Path( datadir, "https_maven.org", "sprova4j-0.1.0-sources.jar" ).read_bytes() return content @pytest.fixture def data_pom_1(datadir): content = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() return content @pytest.fixture def data_jar_2(datadir): content = Path( datadir, "https_maven.org", "sprova4j-0.1.1-sources.jar" ).read_bytes() return content @pytest.fixture def data_pom_2(datadir): content = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() return content -def test_jar_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): +@pytest.fixture +def jar_dirs(datadir, tmp_path): + jar_1_path = os.path.join(datadir, "https_maven.org", "sprova4j-0.1.0-sources.jar") + jar_2_path = os.path.join(datadir, "https_maven.org", "sprova4j-0.1.1-sources.jar") + + jar_1_extract_path = os.path.join(tmp_path, "jar_1") + jar_2_extract_path = os.path.join(tmp_path, "jar_2") + + uncompress(jar_1_path, jar_1_extract_path) + uncompress(jar_2_path, jar_2_extract_path) + + jar_1_dir = Directory.from_disk(path=jar_1_extract_path.encode()) + jar_2_dir = Directory.from_disk(path=jar_2_extract_path.encode()) + + return [jar_1_dir, jar_2_dir] + + +@pytest.fixture +def expected_contents_and_directories(jar_dirs): + jar_1_cnts, _, jar_1_dirs = iter_directory(jar_dirs[0]) + jar_2_cnts, _, jar_2_dirs = iter_directory(jar_dirs[1]) + + contents = {cnt.sha1 for cnt in chain(jar_1_cnts, jar_2_cnts)} + directories = {dir.id for dir in chain(jar_1_dirs, jar_2_dirs)} + + return contents, directories + + +@pytest.fixture +def expected_releases(jar_dirs): + return [ + Release( + name=b"0.1.0", + message=REL_MSGS[0], + author=EMPTY_AUTHOR, + date=REL_DATES[0], + target_type=ModelObjectType.DIRECTORY, + target=jar_dirs[0].hash, + synthetic=True, + metadata=None, + ), + Release( + name=b"0.1.1", + message=REL_MSGS[1], + author=EMPTY_AUTHOR, + date=REL_DATES[1], + target_type=ModelObjectType.DIRECTORY, + target=jar_dirs[1].hash, + synthetic=True, + metadata=None, + ), + ] + + +@pytest.fixture +def expected_snapshot(expected_releases): + return Snapshot( + branches={ + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, + target=b"releases/0.1.1", + ), + b"releases/0.1.0": SnapshotBranch( + target_type=TargetType.RELEASE, + target=expected_releases[0].id, + ), + b"releases/0.1.1": SnapshotBranch( + target_type=TargetType.RELEASE, + target=expected_releases[1].id, + ), + }, + ) + + +@pytest.fixture +def expected_json_metadata(): + return MVN_ARTIFACTS + + +@pytest.fixture +def expected_pom_metadata(data_pom_1, data_pom_2): + return [data_pom_1, data_pom_2] + + +@pytest.fixture(autouse=True) +def network_requests_mock( + requests_mock, + data_jar_1, + data_pom_1, + data_jar_2, + data_pom_2, +): + requests_mock.get(MVN_ARTIFACTS[0]["url"], content=data_jar_1) + requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) + requests_mock.get(MVN_ARTIFACTS[1]["url"], content=data_jar_2) + requests_mock.get(MVN_ARTIFACTS_POM[1], content=data_pom_2) + + +def test_maven_loader_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): + origin_url = "https://ftp.g.o/unknown" unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" loader = MavenLoader( swh_storage, - unknown_artifact_url, + origin_url, artifacts=[ { "time": "2021-07-18 08:05:05.187000", "url": unknown_artifact_url, # unknown artifact "filename": "8sync-0.1.0.tar.gz", "gid": "al/aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": "https://repo1.maven.org/maven2/", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None expected_snapshot_id = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e" assert actual_load_status["snapshot_id"] == expected_snapshot_id stats = get_stats(swh_storage) - assert_last_visit_matches( - swh_storage, unknown_artifact_url, status="partial", type="maven" - ) + assert_last_visit_matches(swh_storage, origin_url, status="partial", type="maven") assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats -def test_jar_visit_inconsistent_base_url( +def test_maven_loader_jar_visit_inconsistent_base_url( swh_storage, requests_mock, data_jar_1, data_pom_1 ): """With no prior visit, loading a jar ends up with 1 snapshot""" with pytest.raises(ValueError, match="more than one Maven instance"): MavenLoader( swh_storage, - MVN_ARTIFACT_URLS[0], + MVN_ORIGIN_URL, artifacts=[ MVN_ARTIFACTS[0], {**MVN_ARTIFACTS[1], "base_url": "http://maven.example/"}, ], ) -def test_jar_visit_with_release_artifact_no_prior_visit( - swh_storage, requests_mock, data_jar_1, data_pom_1 +def test_maven_loader_first_visit( + swh_storage, expected_contents_and_directories, expected_snapshot, expected_releases ): """With no prior visit, loading a jar ends up with 1 snapshot""" - requests_mock.get(MVN_ARTIFACT_URLS[0], content=data_jar_1) - requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) - loader = MavenLoader( - swh_storage, MVN_ARTIFACT_URLS[0], artifacts=[MVN_ARTIFACTS[0]] - ) + + loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" - expected_snapshot_first_visit_id = hash_to_bytes( - "c5195b8ebd148649bf094561877964b131ab27e0" - ) - - expected_snapshot = Snapshot( - id=expected_snapshot_first_visit_id, - branches={ - b"HEAD": SnapshotBranch( - target_type=TargetType.ALIAS, - target=b"releases/0.1.0", - ), - b"releases/0.1.0": SnapshotBranch( - target_type=TargetType.RELEASE, - target=hash_to_bytes(_expected_new_release_first_visit), - ), - }, - ) actual_snapshot = snapshot_get_all_branches( swh_storage, hash_to_bytes(actual_load_status["snapshot_id"]) ) - assert actual_snapshot == expected_snapshot + assert actual_load_status["snapshot_id"] == expected_snapshot.id.hex() check_snapshot(expected_snapshot, swh_storage) - assert ( - hash_to_bytes(actual_load_status["snapshot_id"]) - == expected_snapshot_first_visit_id - ) - stats = get_stats(swh_storage) - assert_last_visit_matches( - swh_storage, MVN_ARTIFACT_URLS[0], status="full", type="maven" - ) + assert_last_visit_matches(swh_storage, MVN_ORIGIN_URL, status="full", type="maven") - expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) + expected_contents, expected_directories = expected_contents_and_directories assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] - - expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) - assert list(swh_storage.directory_missing(expected_dirs)) == [] - - expected_rels = map(hash_to_bytes, {_expected_new_release_first_visit}) - assert list(swh_storage.release_missing(expected_rels)) == [] + assert list(swh_storage.directory_missing(expected_directories)) == [] rel_id = actual_snapshot.branches[b"releases/0.1.0"].target - (rel,) = swh_storage.release_get([rel_id]) - - assert rel == Release( - id=hash_to_bytes(_expected_new_release_first_visit), - name=b"0.1.0", - message=REL_MSG, - author=EMPTY_AUTHOR, - date=REVISION_DATE, - target_type=ModelObjectType.DIRECTORY, - target=hash_to_bytes("6c9de41e4cebb91a8368da1d89ae9873bd540ec3"), - synthetic=True, - metadata=None, - ) + rel2_id = actual_snapshot.branches[b"releases/0.1.1"].target + releases = swh_storage.release_get([rel_id, rel2_id]) + + assert releases == expected_releases assert { - "content": len(_expected_new_contents_first_visit), - "directory": len(_expected_new_directories_first_visit), + "content": len(expected_contents), + "directory": len(expected_directories), "origin": 1, "origin_visit": 1, - "release": 1, + "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats -def test_jar_2_visits_without_change( - swh_storage, requests_mock_datadir, requests_mock, data_jar_2, data_pom_2 +def test_maven_loader_2_visits_without_change( + swh_storage, requests_mock, expected_snapshot ): - """With no prior visit, load a gnu project ends up with 1 snapshot""" - requests_mock.get(MVN_ARTIFACT_URLS[1], content=data_jar_2) - requests_mock.get(MVN_ARTIFACTS_POM[1], content=data_pom_2) - loader = MavenLoader( - swh_storage, MVN_ARTIFACT_URLS[1], artifacts=[MVN_ARTIFACTS[1]] - ) + """With no prior visit, load a maven project ends up with 1 snapshot""" + + loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" - expected_snapshot_first_visit_id = hash_to_bytes( - "91dcacee7a6d2b54f9cab14bc14cb86d22d2ac2b" - ) - - assert ( - hash_to_bytes(actual_load_status["snapshot_id"]) - == expected_snapshot_first_visit_id - ) + assert actual_load_status["snapshot_id"] == expected_snapshot.id.hex() - assert_last_visit_matches( - swh_storage, MVN_ARTIFACT_URLS[1], status="full", type="maven" - ) + assert_last_visit_matches(swh_storage, MVN_ORIGIN_URL, status="full", type="maven") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] is not None assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] - assert_last_visit_matches( - swh_storage, MVN_ARTIFACT_URLS[1], status="full", type="maven" - ) + assert_last_visit_matches(swh_storage, MVN_ORIGIN_URL, status="full", type="maven") # Make sure we have only one entry in history for the pom fetch, one for # the actual download of jar, and that they're correct. - urls_history = [str(req.url) for req in list(requests_mock_datadir.request_history)] + urls_history = [str(req.url) for req in list(requests_mock.request_history)] assert urls_history == [ - MVN_ARTIFACT_URLS[1], + MVN_ARTIFACTS[0]["url"], + MVN_ARTIFACTS_POM[0], + MVN_ARTIFACTS[1]["url"], MVN_ARTIFACTS_POM[1], ] -def test_metadata(swh_storage, requests_mock, data_jar_1, data_pom_1): +def test_maven_loader_extrinsic_metadata( + swh_storage, expected_releases, expected_json_metadata, expected_pom_metadata +): """With no prior visit, loading a jar ends up with 1 snapshot. Extrinsic metadata is the pom file associated to the source jar. """ - requests_mock.get(MVN_ARTIFACT_URLS[0], content=data_jar_1) - requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) - loader = MavenLoader( - swh_storage, MVN_ARTIFACT_URLS[0], artifacts=[MVN_ARTIFACTS[0]] - ) + loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" - expected_release_id = hash_to_bytes(_expected_new_release_first_visit) - release = swh_storage.release_get([expected_release_id])[0] - assert release is not None + for i, expected_release in enumerate(expected_releases): - release_swhid = CoreSWHID( - object_type=ObjectType.RELEASE, object_id=expected_release_id - ) - directory_swhid = ExtendedSWHID( - object_type=ExtendedObjectType.DIRECTORY, object_id=release.target - ) - metadata_authority = MetadataAuthority( - type=MetadataAuthorityType.FORGE, - url=REPO_BASE_URL, - ) + expected_release_id = expected_release.id + release = swh_storage.release_get([expected_release_id])[0] + assert release is not None - expected_metadata = [ - RawExtrinsicMetadata( - target=directory_swhid, - authority=metadata_authority, - fetcher=MetadataFetcher( - name="swh.loader.package.maven.loader.MavenLoader", - version=__version__, + release_swhid = CoreSWHID( + object_type=ObjectType.RELEASE, object_id=expected_release_id + ) + directory_swhid = ExtendedSWHID( + object_type=ExtendedObjectType.DIRECTORY, object_id=release.target + ) + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, + url=REPO_BASE_URL, + ) + + expected_metadata = [ + RawExtrinsicMetadata( + target=directory_swhid, + authority=metadata_authority, + fetcher=MetadataFetcher( + name="swh.loader.package.maven.loader.MavenLoader", + version=__version__, + ), + discovery_date=loader.visit_date, + format="maven-pom", + metadata=expected_pom_metadata[i], + origin=MVN_ORIGIN_URL, + release=release_swhid, ), - discovery_date=loader.visit_date, - format="maven-pom", - metadata=_expected_pom_metadata.encode(), - origin=MVN_ARTIFACT_URLS[0], - release=release_swhid, - ), - RawExtrinsicMetadata( - target=directory_swhid, - authority=metadata_authority, - fetcher=MetadataFetcher( - name="swh.loader.package.maven.loader.MavenLoader", - version=__version__, + RawExtrinsicMetadata( + target=directory_swhid, + authority=metadata_authority, + fetcher=MetadataFetcher( + name="swh.loader.package.maven.loader.MavenLoader", + version=__version__, + ), + discovery_date=loader.visit_date, + format="maven-json", + metadata=json.dumps(expected_json_metadata[i]).encode(), + origin=MVN_ORIGIN_URL, + release=release_swhid, ), - discovery_date=loader.visit_date, - format="maven-json", - metadata=json.dumps(_expected_json_metadata).encode(), - origin=MVN_ARTIFACT_URLS[0], - release=release_swhid, - ), - ] + ] - res = swh_storage.raw_extrinsic_metadata_get(directory_swhid, metadata_authority) - assert res.next_page_token is None - assert set(res.results) == set(expected_metadata) + res = swh_storage.raw_extrinsic_metadata_get( + directory_swhid, metadata_authority + ) + assert res.next_page_token is None + assert set(res.results) == set(expected_metadata) -def test_metadata_no_pom(swh_storage, requests_mock, data_jar_1): +def test_maven_loader_extrinsic_metadata_no_pom( + swh_storage, requests_mock, expected_releases, expected_json_metadata +): """With no prior visit, loading a jar ends up with 1 snapshot. Extrinsic metadata is None if the pom file cannot be retrieved. """ - artifact_url = MVN_ARTIFACT_URLS[0] - requests_mock.get(artifact_url, content=data_jar_1) + requests_mock.get(MVN_ARTIFACTS_POM[0], status_code="404") - loader = MavenLoader(swh_storage, artifact_url, artifacts=[MVN_ARTIFACTS[0]]) + loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" - expected_release_id = hash_to_bytes(_expected_new_release_first_visit) + expected_release_id = expected_releases[0].id release = swh_storage.release_get([expected_release_id])[0] assert release is not None release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=expected_release_id ) directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=REPO_BASE_URL, ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.maven.loader.MavenLoader", version=__version__, ), discovery_date=loader.visit_date, format="maven-pom", metadata=b"", - origin=artifact_url, + origin=MVN_ORIGIN_URL, release=release_swhid, ), RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.maven.loader.MavenLoader", version=__version__, ), discovery_date=loader.visit_date, format="maven-json", - metadata=json.dumps(_expected_json_metadata).encode(), - origin=artifact_url, + metadata=json.dumps(expected_json_metadata[0]).encode(), + origin=MVN_ORIGIN_URL, release=release_swhid, ), ] res = swh_storage.raw_extrinsic_metadata_get(directory_swhid, metadata_authority) assert res.next_page_token is None assert set(res.results) == set(expected_metadata) -def test_jar_extid(): +def test_maven_loader_jar_extid(): """Compute primary key should return the right identity""" - metadata = MVN_ARTIFACTS[0] - # metadata.pop("url", None) - url = MVN_ARTIFACT_URLS[0] - p_info = MavenPackageInfo(url=url, **metadata) + p_info = MavenPackageInfo(**metadata) - expected_manifest = "{gid} {aid} {version} {url} {time}".format( - url=url, **metadata - ).encode() + expected_manifest = "{gid} {aid} {version} {url} {time}".format(**metadata).encode() actual_id = p_info.extid() assert actual_id == ( "maven-jar", 0, hashlib.sha256(expected_manifest).digest(), ) - - -def test_jar_snapshot_append( - swh_storage, - requests_mock_datadir, - requests_mock, - data_jar_1, - data_pom_1, - data_jar_2, - data_pom_2, -): - - # first loading with a first artifact - artifact1 = MVN_ARTIFACTS[0] - url1 = MVN_ARTIFACT_URLS[0] - requests_mock.get(url1, content=data_jar_1) - requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) - loader = MavenLoader(swh_storage, url1, [artifact1]) - actual_load_status = loader.load() - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(swh_storage, url1, status="full", type="maven") - - # check expected snapshot - snapshot = loader.last_snapshot() - assert len(snapshot.branches) == 2 - branch_artifact1_name = f"releases/{artifact1['version']}".encode() - assert b"HEAD" in snapshot.branches - assert branch_artifact1_name in snapshot.branches - assert snapshot.branches[b"HEAD"].target == branch_artifact1_name - - # second loading with a second artifact - artifact2 = MVN_ARTIFACTS[1] - url2 = MVN_ARTIFACT_URLS[1] - requests_mock.get(url2, content=data_jar_2) - requests_mock.get(MVN_ARTIFACTS_POM[1], content=data_pom_2) - loader = MavenLoader(swh_storage, url2, [artifact2]) - actual_load_status = loader.load() - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] is not None - assert_last_visit_matches(swh_storage, url2, status="full", type="maven") - - # check expected snapshot, should contain a new branch and the - # branch for the first artifact - snapshot = loader.last_snapshot() - assert len(snapshot.branches) == 2 - branch_artifact2_name = f"releases/{artifact2['version']}".encode() - assert b"HEAD" in snapshot.branches - assert branch_artifact2_name in snapshot.branches - assert branch_artifact1_name not in snapshot.branches - assert snapshot.branches[b"HEAD"].target == branch_artifact2_name