diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 2.12 -swh.model >= 4.4.0 +swh.model >= 6.5.1 swh.objstorage >= 0.2.2 swh.scheduler >= 0.4.0 swh.storage >= 0.29.0 diff --git a/swh/loader/package/arch/loader.py b/swh/loader/package/arch/loader.py --- a/swh/loader/package/arch/loader.py +++ b/swh/loader/package/arch/loader.py @@ -115,6 +115,7 @@ url=artifact["url"], version=version, last_modified=metadata["last_modified"], + checksums=artifact["checksums"], ) yield release_name(version, artifact["filename"]), p_info diff --git a/swh/loader/package/arch/tests/test_arch.py b/swh/loader/package/arch/tests/test_arch.py --- a/swh/loader/package/arch/tests/test_arch.py +++ b/swh/loader/package/arch/tests/test_arch.py @@ -2,6 +2,9 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +# flake8: noqa: B950 + import pytest from swh.loader.package.arch.loader import ArchLoader @@ -22,16 +25,26 @@ "url": "https://archive.archlinux.org/packages/d/dialog/", "artifacts": [ { - "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "version": "1:1.3_20190211-1", - "length": 180000, + "length": 440, "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", + "checksums": { + "length": 440, + "md5": "ce66c053ded0d51e5610368d85242684", + "sha256": "27c6a7af005cd2214fd63f7498bf51e3bff332df33a9b8f7ed07934823f7ba43", + }, }, { - "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", "version": "1:1.3_20220414-1", - "length": 198000, + "length": 371, "filename": "dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + "checksums": { + "length": 371, + "md5": "5687f6bfc3b6975fdd073deb7075ec09", + "sha256": "b002d18d1e1f356410f73b08170f0bd52f0d83b37b71ccd938594e7d486c4e8a", + }, }, ], "arch_metadata": [ @@ -55,10 +68,15 @@ "url": "https://archlinuxarm.org/packages/aarch64/gzip", "artifacts": [ { - "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950 - "length": 79640, + "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", + "length": 472, "version": "1.12-1", "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", + "checksums": { + "length": 472, + "md5": "0b96fa72ae35c097ec78132ed2f05a57", + "sha256": "8d45b871283e2c37513833f6327ebcdd96c6c3b335588945f873cb809b1e6d2b", + }, } ], "arch_metadata": [ diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -3,6 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from __future__ import annotations + import datetime import hashlib import logging @@ -48,7 +50,7 @@ ) @classmethod - def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": + def from_metadata(cls, a_metadata: Dict[str, Any]) -> ArchivePackageInfo: url = a_metadata["url"] filename = a_metadata.get("filename") return cls( @@ -58,6 +60,7 @@ length=a_metadata["length"], time=a_metadata["time"], version=a_metadata["version"], + checksums={"length": a_metadata["length"]}, ) diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -596,7 +596,7 @@ { "time": 944729610, "url": url, - "length": 221837, + "length": 778240, "filename": filename, "version": "0.1.0", } diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -44,6 +44,7 @@ raw_info=a_metadata, name=a_metadata["package"], version=a_metadata["version"], + checksums=a_metadata.get("checksums", {}), ) diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -128,6 +128,12 @@ """:term:`extrinsic metadata` collected by the loader, that will be attached to the loaded directory and added to the Metadata storage.""" + checksums = attr.ib(type=Dict[str, str], default={}, kw_only=True) + """Dictionary holding package tarball checksums for integrity check after + download, keys are hash algorithm names and values are checksums in + hexadecimal format. The supported algorithms are defined in the + :data:`swh.model.hashutil.ALGORITHMS` set.""" + # TODO: add support for metadata for releases and contents def extid(self) -> Optional[PartialExtID]: @@ -410,7 +416,14 @@ """ try: - return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] + return [ + download( + p_info.url, + dest=tmpdir, + filename=p_info.filename, + hashes=p_info.checksums, + ) + ] except ContentDecodingError: # package might be erroneously marked as gzip compressed while is is not, # try to download its raw bytes again without attempting to uncompress @@ -420,6 +433,7 @@ p_info.url, dest=tmpdir, filename=p_info.filename, + hashes=p_info.checksums, extra_request_headers={"Accept-Encoding": "identity"}, ) ] diff --git a/swh/loader/package/maven/loader.py b/swh/loader/package/maven/loader.py --- a/swh/loader/package/maven/loader.py +++ b/swh/loader/package/maven/loader.py @@ -22,7 +22,7 @@ PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, release_name +from swh.loader.package.utils import EMPTY_AUTHOR, get_url_body, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, @@ -86,6 +86,11 @@ def from_metadata(cls, a_metadata: ArtifactDict) -> MavenPackageInfo: time = iso8601.parse_date(a_metadata["time"]).astimezone(tz=timezone.utc) url = a_metadata["url"] + checksums = {} + try: + checksums["sha1"] = get_url_body(url + ".sha1").decode() + except requests.HTTPError: + pass return cls( url=url, filename=a_metadata.get("filename") or path.split(url)[-1], @@ -100,6 +105,7 @@ metadata=json.dumps(a_metadata).encode(), ), ], + checksums=checksums, ) diff --git a/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar.sha1 b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar.sha1 new file mode 100644 --- /dev/null +++ b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar.sha1 @@ -0,0 +1 @@ +6976e186000753610a63713677f42f0228f04e64 \ No newline at end of file diff --git a/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar.sha1 b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar.sha1 new file mode 100644 --- /dev/null +++ b/swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar.sha1 @@ -0,0 +1 @@ +10c61786a119470096b8d1884e43d5880d99ec7e \ No newline at end of file diff --git a/swh/loader/package/maven/tests/test_maven.py b/swh/loader/package/maven/tests/test_maven.py --- a/swh/loader/package/maven/tests/test_maven.py +++ b/swh/loader/package/maven/tests/test_maven.py @@ -87,6 +87,14 @@ return content +@pytest.fixture +def data_jar_1_sha1(datadir): + content = Path( + datadir, "https_maven.org", "sprova4j-0.1.0-sources.jar.sha1" + ).read_bytes() + return content + + @pytest.fixture def data_pom_1(datadir): content = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @@ -101,6 +109,14 @@ return content +@pytest.fixture +def data_jar_2_sha1(datadir): + content = Path( + datadir, "https_maven.org", "sprova4j-0.1.1-sources.jar.sha1" + ).read_bytes() + return content + + @pytest.fixture def data_pom_2(datadir): content = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @@ -195,13 +211,17 @@ def network_requests_mock( requests_mock, data_jar_1, + data_jar_1_sha1, data_pom_1, data_jar_2, + data_jar_2_sha1, data_pom_2, ): requests_mock.get(MVN_ARTIFACTS[0]["url"], content=data_jar_1) + requests_mock.get(MVN_ARTIFACTS[0]["url"] + ".sha1", content=data_jar_1_sha1) requests_mock.get(MVN_ARTIFACTS_POM[0], content=data_pom_1) requests_mock.get(MVN_ARTIFACTS[1]["url"], content=data_jar_2) + requests_mock.get(MVN_ARTIFACTS[1]["url"] + ".sha1", content=data_jar_2_sha1) requests_mock.get(MVN_ARTIFACTS_POM[1], content=data_pom_2) @@ -329,10 +349,14 @@ # the actual download of jar, and that they're correct. urls_history = [str(req.url) for req in list(requests_mock.request_history)] assert urls_history == [ + MVN_ARTIFACTS[0]["url"] + ".sha1", + MVN_ARTIFACTS[1]["url"] + ".sha1", MVN_ARTIFACTS[0]["url"], MVN_ARTIFACTS_POM[0], MVN_ARTIFACTS[1]["url"], MVN_ARTIFACTS_POM[1], + MVN_ARTIFACTS[0]["url"] + ".sha1", + MVN_ARTIFACTS[1]["url"] + ".sha1", ] diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -93,6 +93,7 @@ metadata=json.dumps(package_metadata).encode(), ) ], + checksums={"sha1": package_metadata["dist"]["shasum"]}, ) diff --git a/swh/loader/package/opam/loader.py b/swh/loader/package/opam/loader.py --- a/swh/loader/package/opam/loader.py +++ b/swh/loader/package/opam/loader.py @@ -217,6 +217,13 @@ f" {self.opam_package} (at url {self.origin.url}) from `opam show`" ) + checksums_str = self.get_enclosed_single_line_field("url.checksum:", version) + checksums = {} + if checksums_str: + for c in checksums_str.strip("[]").split(" "): + algo, hash = c.strip('"').split("=") + checksums[algo] = hash + authors_field = self.get_enclosed_single_line_field("authors:", version) fullname = b"" if authors_field is None else str.encode(authors_field) author = Person.from_fullname(fullname) @@ -241,6 +248,7 @@ format="opam-package-definition", ) ], + checksums=checksums, ) def build_release( diff --git a/swh/loader/package/opam/tests/test_opam.py b/swh/loader/package/opam/tests/test_opam.py --- a/swh/loader/package/opam/tests/test_opam.py +++ b/swh/loader/package/opam/tests/test_opam.py @@ -297,6 +297,13 @@ format="opam-package-definition", ) ], + checksums={ + "sha256": "aa27684fbda1b8036ae7e3c87de33a98a9cd2662bcc91c8447e00e41476b6a46", + "sha512": ( + "1260344f184dd8c8074b0439dbcc8a5d59550a654c249cd61913d4c150c664f" + "37b76195ddca38f7f6646d08bddb320ceb8d420508450b4f09a233cd5c22e6b9b" + ), + }, ) assert branch_name == expected_branch_name diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py --- a/swh/loader/package/pubdev/loader.py +++ b/swh/loader/package/pubdev/loader.py @@ -154,6 +154,7 @@ last_modified=last_modified, author=author, description=description, + checksums={"sha256": v["archive_sha256"]}, ) yield release_name(version), p_info diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker --- a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -10,6 +10,7 @@ "author": "hackcave " }, "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "ca6149c2bb566b07beaf731930ade8b77fad86055b3f37b6eb2f17aca2fbc1b1", "published": "2014-12-24T22:34:02.534090Z" }, "versions": [ @@ -23,7 +24,8 @@ "author": "hackcave " }, "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "ca6149c2bb566b07beaf731930ade8b77fad86055b3f37b6eb2f17aca2fbc1b1", "published": "2014-12-24T22:34:02.534090Z" } ] -} +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication --- a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication @@ -33,7 +33,7 @@ } }, "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", - "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "archive_sha256": "bb8296bce47a5fe587b8d902ff87490593e8b86f736b38c6f9259c958b1f9b21", "published": "2020-08-13T04:53:34.134687Z" }, "versions": [ @@ -70,8 +70,8 @@ } }, "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", - "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "archive_sha256": "bb8296bce47a5fe587b8d902ff87490593e8b86f736b38c6f9259c958b1f9b21", "published": "2020-08-13T04:53:34.134687Z" } ] -} +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier --- a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier @@ -22,7 +22,7 @@ } }, "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", - "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "archive_sha256": "d8f2a8f75732e7f7c3c0295801c95970301536eee205d4532cb3bc1d720cb1bf", "published": "2019-12-22T03:17:30.805225Z" }, "versions": [ @@ -48,8 +48,8 @@ } }, "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", - "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "archive_sha256": "d8f2a8f75732e7f7c3c0295801c95970301536eee205d4532cb3bc1d720cb1bf", "published": "2019-12-22T03:17:30.805225Z" } ] -} +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf --- a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf @@ -28,6 +28,7 @@ } }, "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "archive_sha256": "b69a47f10620b5639bfcf51cd9acd1083e7e856dfc4a23f49df89445d1d27692", "published": "2022-07-25T11:38:25.983876Z" }, "versions": [ @@ -52,6 +53,7 @@ } }, "archive_url": "https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz", + "archive_sha256": "54f1b1c4d519c3bad61ca63b53b46e7e9eabc3b7fb9a4707525520215152e4e1", "published": "2018-07-16T21:12:28.894137Z" }, { @@ -82,7 +84,8 @@ } }, "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "archive_sha256": "b69a47f10620b5639bfcf51cd9acd1083e7e856dfc4a23f49df89445d1d27692", "published": "2022-07-25T11:38:25.983876Z" } ] -} +} \ No newline at end of file diff --git a/swh/loader/package/puppet/loader.py b/swh/loader/package/puppet/loader.py --- a/swh/loader/package/puppet/loader.py +++ b/swh/loader/package/puppet/loader.py @@ -115,6 +115,7 @@ url=url, version=version, last_modified=last_modified, + checksums=data["checksums"], ) yield release_name(version), p_info diff --git a/swh/loader/package/puppet/tests/test_puppet.py b/swh/loader/package/puppet/tests/test_puppet.py --- a/swh/loader/package/puppet/tests/test_puppet.py +++ b/swh/loader/package/puppet/tests/test_puppet.py @@ -24,12 +24,19 @@ "version": "1.0.0", "filename": "saz-memcached-1.0.0.tar.gz", "last_update": "2011-11-20T13:40:30-08:00", + "checksums": { + "length": 763, + }, }, "8.1.0": { "url": "https://forgeapi.puppet.com/v3/files/saz-memcached-8.1.0.tar.gz", # noqa: B950 "version": "8.1.0", "filename": "saz-memcached-8.1.0.tar.gz", "last_update": "2022-07-11T03:34:55-07:00", + "checksums": { + "md5": "5313e8fff0af08d63681daf955e7a604", + "sha256": "0dbb1470c64435700767e9887d0cf70203b1ae59445c401d5d200f2dabb3226e", # noqa: B950 + }, }, }, } diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,8 +1,10 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from __future__ import annotations + import json import logging import os @@ -55,7 +57,7 @@ @classmethod def from_metadata( cls, metadata: Dict[str, Any], name: str, version: str - ) -> "PyPIPackageInfo": + ) -> PyPIPackageInfo: return cls( url=metadata["url"], filename=metadata["filename"], @@ -71,6 +73,7 @@ metadata=json.dumps(metadata).encode(), ) ], + checksums={"sha256": metadata["digests"]["sha256"]}, ) def extid(self) -> PartialExtID: