def register() -> Mapping[str, Any]:
    """Return the registration data of this worker module.

    Returns:
        A mapping with two entries: ``task_modules``, the list holding the
        dotted path of this package's ``tasks`` module, and ``loader``, the
        loader class itself.
    """
    # The loader is imported when the function is called, not when the
    # package is imported.
    from .loader import MavenLoader

    tasks_module = f"{__name__}.tasks"
    return dict(task_modules=[tasks_module], loader=MavenLoader)
@attr.s
class MavenPackageInfo(BasePackageInfo):
    """Package information for one maven source artifact (jar or zip)."""

    time = attr.ib(type=datetime)
    """Timestamp of the last update of jar file on the server."""
    gid = attr.ib(type=str)
    """Group ID of the maven artifact"""
    aid = attr.ib(type=str)
    """Artifact ID of the maven artifact"""
    version = attr.ib(type=str)
    """Version of the maven artifact"""

    # default format for maven artifacts
    MANIFEST_FORMAT = string.Template("$gid $aid $version $url $time")

    def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID:
        """Returns a unique intrinsic identifier of this package info

        The manifest template is filled with ``gid``, ``aid``, ``version``,
        ``url`` and ``time`` (as ``str(self.time)``), and the sha256 digest of
        the resulting text is returned with the ``"maven-jar"`` extid type.

        Args:
            manifest_format: overrides the class' default MANIFEST_FORMAT

        Raises:
            KeyError: if the template uses a placeholder other than the five
              listed above.
        """
        manifest_format = manifest_format or self.MANIFEST_FORMAT
        manifest = manifest_format.substitute(
            {
                "gid": self.gid,
                "aid": self.aid,
                "version": self.version,
                "url": self.url,
                "time": str(self.time),
            }
        )
        return ("maven-jar", hashlib.sha256(manifest.encode()).digest())

    @classmethod
    def from_metadata(cls, a_metadata: Dict[str, Any]) -> "MavenPackageInfo":
        """Build a package info out of one artifact description.

        Args:
            a_metadata: dict with the keys ``url``, ``time`` (iso8601 string),
              ``gid``, ``aid``, ``version`` and, optionally, ``filename``.

        Returns:
            The package info; the whole input dict is attached, JSON-encoded,
            as a ``maven-json`` directory extrinsic metadata entry.

        Raises:
            KeyError: if one of the mandatory keys is missing.
        """
        url = a_metadata["url"]
        # Without an explicit file name, use the last component of the URL
        # *path*: splitting the raw URL would drag a query string or fragment
        # into the name of the downloaded file.
        filename = a_metadata.get("filename") or path.basename(urlparse(url).path)
        # Normalise the timestamp to UTC.
        time = iso8601.parse_date(a_metadata["time"]).astimezone(tz=timezone.utc)
        return cls(
            url=url,
            filename=filename,
            time=time,
            gid=a_metadata["gid"],
            aid=a_metadata["aid"],
            version=a_metadata["version"],
            directory_extrinsic_metadata=[
                RawExtrinsicMetadataCore(
                    format="maven-json", metadata=json.dumps(a_metadata).encode(),
                ),
            ],
        )


class MavenLoader(PackageLoader[MavenPackageInfo]):
    """Load source code jar origin's artifact files into swh archive

    """

    visit_type = "maven"

    # Seconds granted to the POM download; without a timeout ``requests`` may
    # wait forever on an unresponsive server.
    POM_REQUEST_TIMEOUT = 60

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        artifacts: Sequence[Dict[str, Any]],
        extid_manifest_format: Optional[str] = None,
        max_content_size: Optional[int] = None,
    ):
        """Loader constructor.

        For now, this is the lister's task output.
        There is one, and only one, artefact (jar or zip) per version, as guaranteed by
        the Maven coordinates system.

        Args:
            url: Origin url
            artifacts: List of single artifact information with keys:

               - **time**: the time of the last update of jar file on the server
                 as an iso8601 date string

               - **url**: the artifact url to retrieve filename

               - **filename**: optionally, the file's name

               - **gid**: artifact's groupId

               - **aid**: artifact's artifactId

               - **version**: artifact's version

            extid_manifest_format: template string used to format a manifest,
                which is hashed to get the extid of a package.
                Defaults to ``MavenPackageInfo.MANIFEST_FORMAT``, that is
                ``"$gid $aid $version $url $time"``.

        """
        # The text above must stay a plain string literal: an f-string is an
        # ordinary expression, not a docstring, so __doc__ would be None.
        super().__init__(storage=storage, url=url, max_content_size=max_content_size)
        self.artifacts = artifacts  # assume order is enforced in the lister
        # Keep the custom manifest format instead of silently dropping it.
        # NOTE(review): nothing in this class hands it to
        # MavenPackageInfo.extid() yet -- confirm which PackageLoader hook
        # should consume it.
        self.extid_manifest_format = (
            None
            if extid_manifest_format is None
            else string.Template(extid_manifest_format)
        )
        # Artifacts indexed by version, artifacts without a version left out.
        self.version_artifact: OrderedDict[str, Dict[str, Any]]
        self.version_artifact = OrderedDict(
            {str(jar["version"]): jar for jar in artifacts if jar["version"]}
        )

    def get_versions(self) -> Sequence[str]:
        """Return the known versions, in the order of the artifact list."""
        return list(self.version_artifact.keys())

    def get_default_version(self) -> str:
        """Return the version of the last artifact of the list.

        Raises:
            IndexError: if the loader was created with no artifact.
        """
        # Default version is the last item
        return self.artifacts[-1]["version"]

    def get_metadata_authority(self) -> MetadataAuthority:
        """Return a forge authority made of the origin's scheme and host."""
        p_url = urlparse(self.url)
        return MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=f"{p_url.scheme}://{p_url.netloc}/",
            metadata={},
        )

    def build_extrinsic_directory_metadata(
        self, p_info: MavenPackageInfo, release_id: Sha1Git, directory_id: Sha1Git,
    ) -> List[RawExtrinsicMetadata]:
        """Add the artifact's POM to the directory extrinsic metadata.

        The POM is fetched from ``<artifact dir>/<aid>-<version>.pom``. Any
        answer other than HTTP 200 yields an empty ``maven-pom`` entry. The
        original ``maven-json`` entry is kept after it.

        Raises:
            requests.RequestException: if the POM request fails at the network
              level or exceeds ``POM_REQUEST_TIMEOUT``.
        """
        if not p_info.directory_extrinsic_metadata:
            # If this package loader doesn't write metadata, no need to require
            # an implementation for get_metadata_authority.
            return []

        # Get artifacts
        dir_ext_metadata = p_info.directory_extrinsic_metadata[0]
        a_metadata = json.loads(dir_ext_metadata.metadata)
        aid = a_metadata["aid"]
        version = a_metadata["version"]

        # Rebuild POM URL.
        pom_url = f"{path.dirname(p_info.url)}/{aid}-{version}.pom"

        r = requests.get(
            pom_url, allow_redirects=True, timeout=self.POM_REQUEST_TIMEOUT
        )
        metadata_pom = r.content if r.status_code == 200 else b""

        return super().build_extrinsic_directory_metadata(
            attr.evolve(
                p_info,
                directory_extrinsic_metadata=[
                    RawExtrinsicMetadataCore(
                        format="maven-pom", metadata=metadata_pom,
                    ),
                    dir_ext_metadata,
                ],
            ),
            release_id=release_id,
            directory_id=directory_id,
        )

    def get_package_info(self, version: str) -> Iterator[Tuple[str, MavenPackageInfo]]:
        """Yield the single (branch name, package info) pair of a version.

        Raises:
            KeyError: if ``version`` is not one of :meth:`get_versions`.
        """
        a_metadata = self.version_artifact[version]
        yield release_name(a_metadata["version"]), MavenPackageInfo.from_metadata(
            a_metadata
        )

    def build_release(
        self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        """Build the synthetic release pointing at the loaded directory.

        ``uncompressed_path`` is not used by this loader.
        """
        msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8")
        # p_info.time is a datetime (parsed by MavenPackageInfo.from_metadata)
        normalized_time = TimestampWithTimezone.from_datetime(p_info.time)
        return Release(
            name=p_info.version.encode(),
            message=msg,
            date=normalized_time,
            author=EMPTY_AUTHOR,
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )

    def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
        """Return the branches of the last snapshot, or none without one."""
        last_snapshot = self.last_snapshot()
        return last_snapshot.to_dict()["branches"] if last_snapshot else {}
+ """Load jar's artifacts.""" + loader = MavenLoader.from_configfile(url=url, artifacts=artifacts) + return loader.load() diff --git a/swh/loader/package/maven/tests/__init__.py b/swh/loader/package/maven/tests/__init__.py new file mode 100644 diff --git a/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ + + 4.0.0 + al.aldi + sprova4j + 0.1.1 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 
+ http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.5 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.14.1 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/loader/package/maven/tests/test_maven.py b/swh/loader/package/maven/tests/test_maven.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/maven/tests/test_maven.py @@ -0,0 +1,615 @@ +# Copyright (C) 2019-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import hashlib +import json +from pathlib import Path +import string + +import pytest + +from swh.loader.package import __version__ +from swh.loader.package.maven.loader import MavenLoader, MavenPackageInfo +from swh.loader.package.utils import EMPTY_AUTHOR +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + RawExtrinsicMetadata, + Release, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) +from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher +from swh.model.model import ObjectType as ModelObjectType +from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType +from 
swh.storage.algos.snapshot import snapshot_get_all_branches + +URL = "https://repo1.maven.org/maven2/" +MVN_ARTIFACTS = [ + { + "time": "2021-07-12 19:06:59.335000", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/" + + "sprova4j-0.1.0-sources.jar", + "gid": "al.aldi", + "aid": "sprova4j", + "filename": "sprova4j-0.1.0-sources.jar", + "version": "0.1.0", + }, + { + "time": "2021-07-12 19:37:05.534000", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.1/" + + "sprova4j-0.1.1-sources.jar", + "gid": "al.aldi", + "aid": "sprova4j", + "filename": "sprova4j-0.1.1-sources.jar", + "version": "0.1.1", + }, +] + +MVN_ARTIFACTS_POM = [ + "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom", + "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom", +] + +_expected_new_contents_first_visit = [ + "cd807364cd7730022b3849f90ccf4bababbada84", + "79e33dd52ebdf615e6696ae69add91cb990d81e2", + "8002bd514156f05a0940ae14ef86eb0179cbd510", + "23479553a6ccec30d377dee0496123a65d23fd8c", + "07ffbebb933bc1660e448f07d8196c2b083797f9", + "abf021b581f80035b56153c9aa27195b8d7ebbb8", + "eec70ba80a6862ed2619727663b17eb0d9dfe131", + "81a493dacb44dedf623f29ecf62c0e035bf698de", + "bda85ed0bbecf8cddfea04234bee16f476f64fe4", + "1ec91d561f5bdf59acb417086e04c54ead94e94e", + "d517b423da707fa21378623f35facebff53cb59d", + "3f0f21a764972d79e583908991c893c999613354", + "a2dd4d7dfe6043baf9619081e4e29966989211af", + "f62685cf0c6825a4097c949280b584cf0e16d047", + "56afc1ea60cef6548ce0a34f44e91b0e4b063835", + "cf7c740926e7ebc9ac8978a5c4f0e1e7a0e9e3af", + "86ff828bea1c22ca3d50ed82569b9c59ce2c41a1", + "1d0fa04454d9fec31d8ee3f35b58158ca1e28b15", + "e90239a2c8d9ede61a29671a8b397a743e18fa34", + "ce8851005d084aea089bcd8cf01052f4b234a823", + "2c34ce622aa7fa68d104900840f66671718e6249", + "e6a6fec32dcb3bee93c34fc11b0174a6b0b0ec6d", + "405d3e1be4b658bf26de37f2c90c597b2796b9d7", + "d0d2f5848721e04300e537826ef7d2d6d9441df0", + 
"399c67e33e38c475fd724d283dd340f6a2e8dc91", + "dea10c1111cc61ac1809fb7e88857e3db054959f", +] + +_expected_json_metadata = { + "time": "2021-07-12 19:06:59.335000", + "url": ( + "https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/" + "sprova4j-0.1.0-sources.jar" + ), + "gid": "al.aldi", + "aid": "sprova4j", + "filename": "sprova4j-0.1.0-sources.jar", + "version": "0.1.0", +} +_expected_pom_metadata = ( + """ + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + +""" +) + +_expected_new_directories_first_visit = [ + "6c9de41e4cebb91a8368da1d89ae9873bd540ec3", + "c1a2ee97fc47426d0179f94d223405336b5cd075", + "9e1bdca292765a9528af18743bd793b80362c768", + "193a7af634592ef27fb341762806f61e8fb8eab3", + "a297aa21e3dbf138b370be3aae7a852dd403bbbb", + "da84026119ae04022f007d5b3362e98d46d09045", + "75bb915942a9c441ca62aeffc3b634f1ec9ce5e2", + "0851d359283b2ad82b116c8d1b55ab14b1ec219c", + "2bcbb8b723a025ee9a36b719cea229ed38c37e46", +] + +_expected_new_release_first_visit = "02e83c29ec094db581f939d2e238d0613a4f59ac" + +REL_MSG = ( + b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/" + 
def test_jar_visit_with_no_artifact_found(swh_storage, requests_mock_datadir):
    """Visit an origin whose only artifact URL is unknown.

    The load status must be "uneventful" with the expected snapshot id, the
    visit must be recorded as "partial", and the storage must hold only the
    origin, its visit and one snapshot.
    """
    origin_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz"
    unknown_artifact = {
        "time": "2021-07-18 08:05:05.187000",
        "url": origin_url,  # unknown artifact
        "filename": "8sync-0.1.0.tar.gz",
        "gid": "al/aldi",
        "aid": "sprova4j",
        "version": "0.1.0",
    }
    loader = MavenLoader(swh_storage, origin_url, artifacts=[unknown_artifact])

    load_status = loader.load()
    assert load_status["status"] == "uneventful"
    assert load_status["snapshot_id"] is not None
    assert load_status["snapshot_id"] == "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"

    stats = get_stats(swh_storage)

    assert_last_visit_matches(swh_storage, origin_url, status="partial", type="maven")

    expected_stats = {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert expected_stats == stats
1 snapshot + + """ + requests_mock.get(MVN_ARTIFACTS[0]["url"], content=data_jar_1) + requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) + loader = MavenLoader( + swh_storage, MVN_ARTIFACTS[0]["url"], artifacts=[MVN_ARTIFACTS[0]] + ) + + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + + expected_snapshot_first_visit_id = hash_to_bytes( + "c5195b8ebd148649bf094561877964b131ab27e0" + ) + + expected_snapshot = Snapshot( + id=expected_snapshot_first_visit_id, + branches={ + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"releases/0.1.0", + ), + b"releases/0.1.0": SnapshotBranch( + target_type=TargetType.RELEASE, + target=hash_to_bytes(_expected_new_release_first_visit), + ), + }, + ) + actual_snapshot = snapshot_get_all_branches( + swh_storage, hash_to_bytes(actual_load_status["snapshot_id"]) + ) + + assert actual_snapshot == expected_snapshot + check_snapshot(expected_snapshot, swh_storage) + + assert ( + hash_to_bytes(actual_load_status["snapshot_id"]) + == expected_snapshot_first_visit_id + ) + + stats = get_stats(swh_storage) + assert_last_visit_matches( + swh_storage, MVN_ARTIFACTS[0]["url"], status="full", type="maven" + ) + + expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) + assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] + + expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) + assert list(swh_storage.directory_missing(expected_dirs)) == [] + + expected_rels = map(hash_to_bytes, {_expected_new_release_first_visit}) + assert list(swh_storage.release_missing(expected_rels)) == [] + + rel_id = actual_snapshot.branches[b"releases/0.1.0"].target + (rel,) = swh_storage.release_get([rel_id]) + + assert rel == Release( + id=hash_to_bytes(_expected_new_release_first_visit), + name=b"0.1.0", + message=REL_MSG, + author=EMPTY_AUTHOR, + date=REVISION_DATE, + target_type=ModelObjectType.DIRECTORY, + 
def test_jar_2_visits_without_change(
    swh_storage, requests_mock_datadir, requests_mock, data_jar_2, data_pom_2
):
    """Load the same artifact twice with the same loader.

    The first load is eventful and yields the expected snapshot; the second
    one is uneventful and reports the very same snapshot.
    """
    artifact = MVN_ARTIFACTS[1]
    origin_url = artifact["url"]
    pom_url = MVN_ARTIFACTS_POM[1]
    requests_mock.get(origin_url, content=data_jar_2)
    requests_mock.get(pom_url, content=data_pom_2)
    loader = MavenLoader(swh_storage, origin_url, artifacts=[artifact])

    first_status = loader.load()
    assert first_status["status"] == "eventful"

    expected_snapshot_id = hash_to_bytes("91dcacee7a6d2b54f9cab14bc14cb86d22d2ac2b")
    assert hash_to_bytes(first_status["snapshot_id"]) == expected_snapshot_id

    assert_last_visit_matches(swh_storage, origin_url, status="full", type="maven")

    second_status = loader.load()
    assert second_status["status"] == "uneventful"
    assert second_status["snapshot_id"] is not None
    assert first_status["snapshot_id"] == second_status["snapshot_id"]

    assert_last_visit_matches(swh_storage, origin_url, status="full", type="maven")

    # Over the two visits, the jar and its pom must each have been requested
    # exactly once, in that order.
    requested_urls = [str(req.url) for req in requests_mock_datadir.request_history]
    assert requested_urls == [origin_url, pom_url]
def test_metadatata_no_pom(swh_storage, requests_mock, data_jar_1):
    """With no prior visit, loading a jar ends up with 1 snapshot.

    When the pom file cannot be retrieved (HTTP 404), the ``maven-pom``
    extrinsic metadata entry is still written, with empty content, next to
    the ``maven-json`` one.
    """
    requests_mock.get(MVN_ARTIFACTS[0]["url"], content=data_jar_1)
    # The status code must be an int: a "404" string only compares unequal to
    # 200 by accident and breaks any numeric check on the response.
    requests_mock.get(MVN_ARTIFACTS_POM[0], status_code=404)
    loader = MavenLoader(
        swh_storage, MVN_ARTIFACTS[0]["url"], artifacts=[MVN_ARTIFACTS[0]]
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    expected_release_id = hash_to_bytes(_expected_new_release_first_visit)
    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://repo1.maven.org/",
    )

    # Both entries only differ by their format and raw content.
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.maven.loader.MavenLoader", version=__version__,
            ),
            discovery_date=loader.visit_date,
            format=metadata_format,
            metadata=raw_metadata,
            origin=MVN_ARTIFACTS[0]["url"],
            release=release_swhid,
        )
        for metadata_format, raw_metadata in [
            ("maven-pom", b""),
            ("maven-json", json.dumps(_expected_json_metadata).encode()),
        ]
    ]
    res = swh_storage.raw_extrinsic_metadata_get(directory_swhid, metadata_authority)
    assert res.next_page_token is None
    assert set(res.results) == set(expected_metadata)
def test_jar_snapshot_append(
    swh_storage,
    requests_mock_datadir,
    requests_mock,
    data_jar_1,
    data_pom_1,
    data_jar_2,
    data_pom_2,
):
    """Load two artifacts one after the other, each under its own origin URL.

    Each visit must be eventful and full, and each resulting snapshot must
    hold exactly HEAD plus the release branch of the artifact just loaded.
    """
    first_artifact, second_artifact = MVN_ARTIFACTS[0], MVN_ARTIFACTS[1]

    # first loading with a first artifact
    url1 = first_artifact["url"]
    requests_mock.get(url1, content=data_jar_1)
    requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1)
    loader = MavenLoader(swh_storage, url1, [first_artifact])
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url1, status="full", type="maven")

    # check expected snapshot
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    branch_artifact1_name = f"releases/{first_artifact['version']}".encode()
    assert b"HEAD" in snapshot.branches
    assert branch_artifact1_name in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == branch_artifact1_name

    # second loading with a second artifact
    url2 = second_artifact["url"]
    requests_mock.get(url2, content=data_jar_2)
    requests_mock.get(MVN_ARTIFACTS_POM[1], content=data_pom_2)
    loader = MavenLoader(swh_storage, url2, [second_artifact])
    status = loader.load()
    assert status["status"] == "eventful"
    assert status["snapshot_id"] is not None
    assert_last_visit_matches(swh_storage, url2, status="full", type="maven")

    # check expected snapshot: it holds HEAD and the branch of the second
    # artifact only, the branch of the first artifact is absent.
    # NOTE(review): url1 and url2 are different origins, so this looks like it
    # never exercises appending to a previous snapshot of the same origin --
    # confirm this is the intent.
    snapshot = loader.last_snapshot()
    assert len(snapshot.branches) == 2
    branch_artifact2_name = f"releases/{second_artifact['version']}".encode()
    assert b"HEAD" in snapshot.branches
    assert branch_artifact2_name in snapshot.branches
    assert branch_artifact1_name not in snapshot.branches
    assert snapshot.branches[b"HEAD"].target == branch_artifact2_name
swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.maven.loader.MavenLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.maven.tasks.LoadMaven", + kwargs=dict(url=MVN_ARTIFACTS[0]["url"], artifacts=[]), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"}