diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -7,6 +7,7 @@ import json import logging import os +import string from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote @@ -16,11 +17,9 @@ from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, - PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name -from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, @@ -38,18 +37,27 @@ EMPTY_PERSON = Person.from_fullname(b"") -EXTID_TYPE = "npm-archive-sha1" -EXTID_VERSION = 0 - - @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) + package_name = attr.ib(type=str) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" + # we cannot rely only on $shasum, as it is technically possible for two versions + # of the same package to have the exact same tarball. + # But the release data (message and date) are extrinsic to the content of the + # package, so they differ between versions. + # So we need every attribute used to build the release object to be part of the + # manifest. + MANIFEST_FORMAT = string.Template( + "date $date\nname $package_name\nshasum $shasum\n" "url $url\nversion $version" + ) + EXTID_TYPE = "npm-manifest-sha256" + EXTID_VERSION = 0 + @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str @@ -57,6 +65,8 @@ package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] + assert package_metadata["name"] == project_metadata["name"] + # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. @@ -70,6 +80,7 @@ date = None return cls( + package_name=package_metadata["name"], url=url, filename=os.path.basename(url), date=date, @@ -84,9 +95,6 @@ ], ) - def extid(self) -> PartialExtID: - return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.shasum)) - class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. @@ -144,12 +152,17 @@ def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: + # Metadata from NPM is not intrinsic to tarballs. + # This means two package versions can have the same tarball, but different + # metadata. To avoid mixing up releases, we every field used to build the + # release object must be part of NpmPackageInfo.MANIFEST_FORMAT. i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) + assert self.package_name == p_info.package_name msg = ( - f"Synthetic release for NPM source package {self.package_name} " + f"Synthetic release for NPM source package {p_info.package_name} " f"version {p_info.version}\n" ) diff --git a/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz new file mode 120000 --- /dev/null +++ b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz @@ -0,0 +1 @@ +org_-_org-0.0.3.tgz \ No newline at end of file diff --git a/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch new file mode 100644 --- /dev/null +++ b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch @@ -0,0 +1,141 @@ +{ + "_id": "org_version_mismatch", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.3" + }, + "versions": { + "0.0.3-beta": { + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.3-beta", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.3-beta", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3-beta.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org_version_mismatch", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44Z", + "created": "2014-01-01T15:40:31Z", + "0.0.3-beta": "2014-01-01T15:40:33Z", + "0.0.3": "2014-01-01T15:55:45Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -482,12 +482,12 @@ @pytest.mark.usefixtures("requests_mock_datadir") def test_npm_loader_version_divergence(swh_storage): - package = "@aller_shared" + package = "@aller/shared" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("ebbe6397d0c2a6cf7cba40fa5b043c59dd4f2497") + expected_snapshot_id = hash_to_bytes("68eed3d3bc852e7f435a84f18ee77e23f6884be2") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), @@ -504,11 +504,11 @@ ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("04c66f3a82aa001e8f1b45246b58b82d2b0ca0df"), + target=hash_to_bytes("0c486b50b407f847ef7581f595c2b6c2062f1089"), ), b"releases/0.1.1-alpha.14": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("90cc04dc72193f3b1444f10e1c525bee2ea9dac6"), + target=hash_to_bytes("79d80c87c0a8d104a216cc539baad962a454802a"), ), }, ) @@ -528,6 +528,94 @@ } == stats +def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir): + """Test with two versions that have exactly the same tarball""" + package = "org_version_mismatch" + url = package_url(package) + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a") + assert actual_load_status == { + "status": "eventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1" + release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0" + versions = [ + ("0.0.3-beta", beta_release_id), + ("0.0.3", release_id), + ] + + expected_snapshot = Snapshot( + id=expected_snapshot_id, + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/0.0.3", target_type=TargetType.ALIAS + ), + **{ + b"releases/" + + version_name.encode(): SnapshotBranch( + target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, + ) + for (version_name, version_id) in versions + }, + }, + ) + check_snapshot(expected_snapshot, swh_storage) + + assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release( + name=b"0.0.3-beta", + message=( + b"Synthetic release for NPM source package org_version_mismatch " + b"version 0.0.3-beta\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada "), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(beta_release_id), + ) + + assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( + name=b"0.0.3", + message=( + b"Synthetic release for NPM source package org_version_mismatch " + b"version 0.0.3\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada "), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(release_id), + ) + + # Check incremental re-load keeps it unchanged + + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + assert actual_load_status == { + "status": "uneventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion