diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -7,6 +7,7 @@ import json import logging import os +import string from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote @@ -16,11 +17,9 @@ from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, - PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name -from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, @@ -38,18 +37,25 @@ EMPTY_PERSON = Person.from_fullname(b"") -EXTID_TYPE = "npm-archive-sha1" -EXTID_VERSION = 0 - - @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) + id_ = attr.ib(type=str) + """Unique id assigned by the registry for this version.""" + date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" + # We cannot rely on $shasum alone, as it is technically possible for two versions + # of the same package to have the exact same tarball. + # The release data (message and date) is extrinsic to the content of the package + # and differs between versions, so $id_ is included to tell such versions apart. + MANIFEST_FORMAT = string.Template("$id_ $shasum") + EXTID_TYPE = "npm-archive-url-and-sha1" + EXTID_VERSION = 0 + @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str @@ -70,6 +76,7 @@ date = None return cls( + id_=package_metadata["_id"], url=url, filename=os.path.basename(url), date=date, @@ -84,9 +91,6 @@ ], ) - def extid(self) -> PartialExtID: - return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.shasum)) - class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. 
diff --git a/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz new file mode 120000 --- /dev/null +++ b/swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz @@ -0,0 +1 @@ +org_-_org-0.0.3.tgz \ No newline at end of file diff --git a/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch new file mode 100644 --- /dev/null +++ b/swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch @@ -0,0 +1,141 @@ +{ + "_id": "org", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.3" + }, + "versions": { + "0.0.3-beta": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.3-beta", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.3-beta", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3-beta.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + 
"keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44Z", + "created": "2014-01-01T15:40:31Z", + "0.0.3-beta": "2014-01-01T15:40:33Z", + "0.0.3": "2014-01-01T15:55:45Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": 
"http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -528,6 +528,94 @@ } == stats +def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir): + """Test with two versions that have exactly the same tarball""" + package = "org_version_mismatch" + url = package_url(package) + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a") + assert actual_load_status == { + "status": "eventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1" + release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0" + versions = [ + ("0.0.3-beta", beta_release_id), + ("0.0.3", release_id), + ] + + expected_snapshot = Snapshot( + id=expected_snapshot_id, + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/0.0.3", target_type=TargetType.ALIAS + ), + **{ + b"releases/" + + version_name.encode(): SnapshotBranch( + target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, + ) + for (version_name, version_id) in versions + }, + }, + ) + check_snapshot(expected_snapshot, swh_storage) + + assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release( + name=b"0.0.3-beta", + message=( + b"Synthetic release for NPM source package 
org_version_mismatch " + b"version 0.0.3-beta\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada <stillpedant@gmail.com>"), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(beta_release_id), + ) + + assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( + name=b"0.0.3", + message=( + b"Synthetic release for NPM source package org_version_mismatch " + b"version 0.0.3\n" + ), + target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b"Masafumi Oyamada <stillpedant@gmail.com>"), + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc) + ), + id=hash_to_bytes(release_id), + ) + + # Check incremental re-load keeps it unchanged + + loader = NpmLoader(swh_storage, url) + + actual_load_status = loader.load() + assert actual_load_status == { + "status": "uneventful", + "snapshot_id": expected_snapshot_id.hex(), + } + + assert_last_visit_matches( + swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id + ) + + def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion